From d76bb7a09bb3b8711077912f3e80cfcf39cd9d0b Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 27 May 2020 00:38:38 -0400 Subject: tools/power turbostat: Print /dev/cpu_dma_latency Users are puzzled when they use tuned performance and all their C-states vanish. Dump /dev/cpu_dma_latency and state whether the value is default, or constraining, to explain this situation. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 33b370865d16..4c679568fda4 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4698,6 +4698,32 @@ unsigned int intel_model_duplicates(unsigned int model) } return model; } + +void print_dev_latency(void) +{ + char *path = "/dev/cpu_dma_latency"; + int fd; + int value; + int retval; + + fd = open(path, O_RDONLY); + if (fd < 0) { + warn("fopen %s\n", path); + return; + } + + retval = read(fd, (void *)&value, sizeof(int)); + if (retval != sizeof(int)) { + warn("read %s\n", path); + close(fd); + return; + } + fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n", + value, value == 2000000000 ? "default" : "constrained"); + + close(fd); +} + void process_cpuid() { unsigned int eax, ebx, ecx, edx; @@ -4966,6 +4992,8 @@ void process_cpuid() if (!quiet) dump_cstate_pstate_config_info(family, model); + if (!quiet) + print_dev_latency(); if (!quiet) dump_sysfs_cstate_config(); if (!quiet) -- cgit v1.2.3 From 9aefc2cda6353f48708415d9adc5dff4deb73412 Mon Sep 17 00:00:00 2001 From: Doug Smythies Date: Thu, 26 Mar 2020 13:36:37 -0700 Subject: tools/power turbostat: Always print idle in the system configuration header If the --quiet option is not used, turbostat prints a useful system configuration header during startup. But inclusion of idle system configuration information in this header is currently a function of inclusion in the columns chosen to be displayed. Always list this idle system configuration. Signed-off-by: Doug Smythies Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4c679568fda4..4edad3fc760a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3530,9 +3530,6 @@ dump_sysfs_cstate_config(void) int state; char *sp; - if (!DO_BIC(BIC_sysfs)) - return; - if (access("/sys/devices/system/cpu/cpuidle", R_OK)) { fprintf(outf, "cpuidle not loaded\n"); return; -- cgit v1.2.3 From 7c2ccc507bd44d17227930181f937b2066565349 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Sat, 18 Apr 2020 16:31:47 +0800 Subject: tools/power turbostat: Make the energy variable to be 64 bit Change the energy variable from 32bit to 64bit, so that it can record long time duration. After this conversion, adjust the DELTA_WRAP32() accordingly. Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4edad3fc760a..29ec1018852d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -211,12 +211,12 @@ struct pkg_data { long long gfx_rc6_ms; unsigned int gfx_mhz; unsigned int package_id; - unsigned int energy_pkg; /* MSR_PKG_ENERGY_STATUS */ - unsigned int energy_dram; /* MSR_DRAM_ENERGY_STATUS */ - unsigned int energy_cores; /* MSR_PP0_ENERGY_STATUS */ - unsigned int energy_gfx; /* MSR_PP1_ENERGY_STATUS */ - unsigned int rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ - unsigned int rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ + unsigned long long energy_pkg; /* MSR_PKG_ENERGY_STATUS */ + unsigned long long energy_dram; /* MSR_DRAM_ENERGY_STATUS */ + unsigned long long energy_cores; /* MSR_PP0_ENERGY_STATUS */ + unsigned long long energy_gfx; /* MSR_PP1_ENERGY_STATUS */ + unsigned long long rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ + unsigned long long rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned long long counter[MAX_ADDED_COUNTERS]; } *package_even, *package_odd; @@ -858,13 +858,13 @@ int dump_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "pc10: %016llX\n", p->pc10); outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi); outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi); - outp += sprintf(outp, "Joules PKG: %0X\n", p->energy_pkg); - outp += sprintf(outp, "Joules COR: %0X\n", p->energy_cores); - outp += sprintf(outp, "Joules GFX: %0X\n", p->energy_gfx); - outp += sprintf(outp, "Joules RAM: %0X\n", p->energy_dram); - outp += sprintf(outp, "Throttle PKG: %0X\n", + outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg); + outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores); + outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx); + outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram); + outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status); - outp += sprintf(outp, "Throttle RAM: %0X\n", + outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status); outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); @@ -1210,11 +1210,7 @@ void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_ } #define DELTA_WRAP32(new, old) \ - if (new > old) { \ - old = new - old; \ - } else { \ - old = 0x100000000 + new - old; \ - } + old = ((((unsigned long long)new << 32) - ((unsigned long long)old << 32)) >> 32); int delta_package(struct pkg_data *new, struct pkg_data *old) -- cgit v1.2.3 From 87e15da95775a2ffb8c444e84f08ca982b758364 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Sat, 18 Apr 2020 16:31:57 +0800 Subject: tools/power turbostat: Introduce functions to accumulate RAPL consumption Since the RAPL Joule Counter is 32 bit, turbostat would only print a *star* instead of printing the actual energy consumed to indicate the overflow due to long duration. This does not meet the requirement from servers as the sampling time of turbostat is usually very long on servers. So maintain a set of MSR buffer, and update them periodically before the 32bit MSR register is wrapped round, so as to avoid the overflow. The idea is similar to the implementation of ktime_get(): Periodical MSR timer: total_rapl_sum += (current_rapl_msr - last_rapl_msr); Using get_msr_sum() to get the accumulated RAPL: return (current_rapl_msr - last_rapl_msr) + total_rapl_sum; The accumulated RAPL mechanism will be turned on in next patch. Originally-by: Aaron Lu Reviewed-by: Doug Smythies Tested-by: Doug Smythies Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/Makefile | 2 +- tools/power/x86/turbostat/turbostat.c | 224 +++++++++++++++++++++++++++++++++- 2 files changed, 219 insertions(+), 7 deletions(-) diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index 2b6551269e43..d08765531bcb 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -16,7 +16,7 @@ override CFLAGS += -D_FORTIFY_SOURCE=2 %: %.c @mkdir -p $(BUILD_OUTPUT) - $(CC) $(CFLAGS) $< -o $(BUILD_OUTPUT)/$@ $(LDFLAGS) -lcap + $(CC) $(CFLAGS) $< -o $(BUILD_OUTPUT)/$@ $(LDFLAGS) -lcap -lrt .PHONY : clean clean : diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 29ec1018852d..c1759f6c84a8 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -259,6 +259,113 @@ struct msr_counter { #define SYSFS_PERCPU (1 << 1) }; +/* + * The accumulated sum of MSR is defined as a monotonic + * increasing MSR, it will be accumulated periodically, + * despite its register's bit width. + */ +enum { + IDX_PKG_ENERGY, + IDX_DRAM_ENERGY, + IDX_PP0_ENERGY, + IDX_PP1_ENERGY, + IDX_PKG_PERF, + IDX_DRAM_PERF, + IDX_COUNT, +}; + +int get_msr_sum(int cpu, off_t offset, unsigned long long *msr); + +struct msr_sum_array { + /* get_msr_sum() = sum + (get_msr() - last) */ + struct { + /*The accumulated MSR value is updated by the timer*/ + unsigned long long sum; + /*The MSR footprint recorded in last timer*/ + unsigned long long last; + } entries[IDX_COUNT]; +}; + +/* The percpu MSR sum array.*/ +struct msr_sum_array *per_cpu_msr_sum; + +int idx_to_offset(int idx) +{ + int offset; + + switch (idx) { + case IDX_PKG_ENERGY: + offset = MSR_PKG_ENERGY_STATUS; + break; + case IDX_DRAM_ENERGY: + offset = MSR_DRAM_ENERGY_STATUS; + break; + case IDX_PP0_ENERGY: + offset = MSR_PP0_ENERGY_STATUS; + break; + case IDX_PP1_ENERGY: + offset = MSR_PP1_ENERGY_STATUS; + break; + case IDX_PKG_PERF: + offset = MSR_PKG_PERF_STATUS; + break; + case IDX_DRAM_PERF: + offset = MSR_DRAM_PERF_STATUS; + break; + default: + offset = -1; + } + return offset; +} + +int offset_to_idx(int offset) +{ + int idx; + + switch (offset) { + case MSR_PKG_ENERGY_STATUS: + idx = IDX_PKG_ENERGY; + break; + case MSR_DRAM_ENERGY_STATUS: + idx = IDX_DRAM_ENERGY; + break; + case MSR_PP0_ENERGY_STATUS: + idx = IDX_PP0_ENERGY; + break; + case MSR_PP1_ENERGY_STATUS: + idx = IDX_PP1_ENERGY; + break; + case MSR_PKG_PERF_STATUS: + idx = IDX_PKG_PERF; + break; + case MSR_DRAM_PERF_STATUS: + idx = IDX_DRAM_PERF; + break; + default: + idx = -1; + } + return idx; +} + +int idx_valid(int idx) +{ + switch (idx) { + case IDX_PKG_ENERGY: + return do_rapl & RAPL_PKG; + case IDX_DRAM_ENERGY: + return do_rapl & RAPL_DRAM; + case IDX_PP0_ENERGY: + return do_rapl & RAPL_CORES_ENERGY_STATUS; + case IDX_PP1_ENERGY: + return do_rapl & RAPL_GFX; + case IDX_PKG_PERF: + return do_rapl & RAPL_PKG_PERF_STATUS; + case IDX_DRAM_PERF: + return do_rapl & RAPL_DRAM_PERF_STATUS; + default: + return 0; + } +} struct sys_counters { unsigned int added_thread_counters; unsigned int added_core_counters; @@ -1250,12 +1357,12 @@ delta_package(struct pkg_data *new, struct pkg_data *old) old->gfx_mhz = new->gfx_mhz; - DELTA_WRAP32(new->energy_pkg, old->energy_pkg); - DELTA_WRAP32(new->energy_cores, old->energy_cores); - DELTA_WRAP32(new->energy_gfx, old->energy_gfx); - DELTA_WRAP32(new->energy_dram, old->energy_dram); - DELTA_WRAP32(new->rapl_pkg_perf_status, old->rapl_pkg_perf_status); - DELTA_WRAP32(new->rapl_dram_perf_status, old->rapl_dram_perf_status); + old->energy_pkg = new->energy_pkg - old->energy_pkg; + old->energy_cores = new->energy_cores - old->energy_cores; + old->energy_gfx = new->energy_gfx - old->energy_gfx; + old->energy_dram = new->energy_dram - old->energy_dram; + old->rapl_pkg_perf_status = new->rapl_pkg_perf_status - old->rapl_pkg_perf_status; + old->rapl_dram_perf_status = new->rapl_dram_perf_status - old->rapl_dram_perf_status; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -3053,6 +3160,111 @@ void do_sleep(void) } } +int get_msr_sum(int cpu, off_t offset, unsigned long long *msr) +{ + int ret, idx; + unsigned long long msr_cur, msr_last; + + if (!per_cpu_msr_sum) + return 1; + + idx = offset_to_idx(offset); + if (idx < 0) + return idx; + /* get_msr_sum() = sum + (get_msr() - last) */ + ret = get_msr(cpu, offset, &msr_cur); + if (ret) + return ret; + msr_last = per_cpu_msr_sum[cpu].entries[idx].last; + DELTA_WRAP32(msr_cur, msr_last); + *msr = msr_last + per_cpu_msr_sum[cpu].entries[idx].sum; + + return 0; +} + +timer_t timerid; + +/* Timer callback, update the sum of MSRs periodically. */ +static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + int i, ret; + int cpu = t->cpu_id; + + for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) { + unsigned long long msr_cur, msr_last; + int offset; + + if (!idx_valid(i)) + continue; + offset = idx_to_offset(i); + if (offset < 0) + continue; + ret = get_msr(cpu, offset, &msr_cur); + if (ret) { + fprintf(outf, "Can not update msr(0x%x)\n", offset); + continue; + } + + msr_last = per_cpu_msr_sum[cpu].entries[i].last; + per_cpu_msr_sum[cpu].entries[i].last = msr_cur & 0xffffffff; + + DELTA_WRAP32(msr_cur, msr_last); + per_cpu_msr_sum[cpu].entries[i].sum += msr_last; + } + return 0; +} + +static void +msr_record_handler(union sigval v) +{ + for_all_cpus(update_msr_sum, EVEN_COUNTERS); +} + +void msr_sum_record(void) +{ + struct itimerspec its; + struct sigevent sev; + + per_cpu_msr_sum = calloc(topo.max_cpu_num + 1, sizeof(struct msr_sum_array)); + if (!per_cpu_msr_sum) { + fprintf(outf, "Can not allocate memory for long time MSR.\n"); + return; + } + /* + * Signal handler might be restricted, so use thread notifier instead. + */ + memset(&sev, 0, sizeof(struct sigevent)); + sev.sigev_notify = SIGEV_THREAD; + sev.sigev_notify_function = msr_record_handler; + + sev.sigev_value.sival_ptr = &timerid; + if (timer_create(CLOCK_REALTIME, &sev, &timerid) == -1) { + fprintf(outf, "Can not create timer.\n"); + goto release_msr; + } + + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 1; + /* + * A wraparound time has been calculated early. + * Some sources state that the peak power for a + * microprocessor is usually 1.5 times the TDP rating, + * use 2 * TDP for safety. + */ + its.it_interval.tv_sec = rapl_joule_counter_range / 2; + its.it_interval.tv_nsec = 0; + + if (timer_settime(timerid, 0, &its, NULL) == -1) { + fprintf(outf, "Can not set timer.\n"); + goto release_timer; + } + return; + + release_timer: + timer_delete(timerid); + release_msr: + free(per_cpu_msr_sum); +} void turbostat_loop() { -- cgit v1.2.3 From 9972d5d84d76982606806b2ce887f70c2f8ba60a Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Sat, 18 Apr 2020 16:32:05 +0800 Subject: tools/power turbostat: Enable accumulate RAPL display Enable the accumulated RAPL display by default. Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 38 +++++++++++++++-------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c1759f6c84a8..66c468262020 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1169,14 +1169,7 @@ int format_counters(struct thread_data *t, struct core_data *c, } } - /* - * If measurement interval exceeds minimum RAPL Joule Counter range, - * indicate that results are suspect by printing "**" in fraction place. - */ - if (interval_float < rapl_joule_counter_range) - fmt8 = "%s%.2f"; - else - fmt8 = "%6.0f**"; + fmt8 = "%s%.2f"; if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); @@ -2069,39 +2062,39 @@ retry: p->sys_lpi = cpuidle_cur_sys_lpi_us; if (do_rapl & RAPL_PKG) { - if (get_msr(cpu, MSR_PKG_ENERGY_STATUS, &msr)) + if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) return -13; - p->energy_pkg = msr & 0xFFFFFFFF; + p->energy_pkg = msr; } if (do_rapl & RAPL_CORES_ENERGY_STATUS) { - if (get_msr(cpu, MSR_PP0_ENERGY_STATUS, &msr)) + if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) return -14; - p->energy_cores = msr & 0xFFFFFFFF; + p->energy_cores = msr; } if (do_rapl & RAPL_DRAM) { - if (get_msr(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) + if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) return -15; - p->energy_dram = msr & 0xFFFFFFFF; + p->energy_dram = msr; } if (do_rapl & RAPL_GFX) { - if (get_msr(cpu, MSR_PP1_ENERGY_STATUS, &msr)) + if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) return -16; - p->energy_gfx = msr & 0xFFFFFFFF; + p->energy_gfx = msr; } if (do_rapl & RAPL_PKG_PERF_STATUS) { - if (get_msr(cpu, MSR_PKG_PERF_STATUS, &msr)) + if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) return -16; - p->rapl_pkg_perf_status = msr & 0xFFFFFFFF; + p->rapl_pkg_perf_status = msr; } if (do_rapl & RAPL_DRAM_PERF_STATUS) { - if (get_msr(cpu, MSR_DRAM_PERF_STATUS, &msr)) + if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) return -16; - p->rapl_dram_perf_status = msr & 0xFFFFFFFF; + p->rapl_dram_perf_status = msr; } if (do_rapl & RAPL_AMD_F17H) { - if (get_msr(cpu, MSR_PKG_ENERGY_STAT, &msr)) + if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) return -13; - p->energy_pkg = msr & 0xFFFFFFFF; + p->energy_pkg = msr; } if (DO_BIC(BIC_PkgTmp)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) @@ -6101,6 +6094,7 @@ int main(int argc, char **argv) return 0; } + msr_sum_record(); /* * if any params left, it must be a command to fork */ -- cgit v1.2.3 From 8201a0285789fade1c5b031914577e2b27a64f05 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 29 Jun 2020 15:26:57 -0400 Subject: tools/power turbostat: Use sched_getcpu() instead of hardcoded cpu 0 Disabling cpu 0 results in an error turbostat: /sys/devices/system/cpu/cpu0/topology/thread_siblings: open failed: No such file or directory Use sched_getcpu() instead of a hardcoded cpu 0 to get the max cpu number. Signed-off-by: Prarit Bhargava Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 66c468262020..151a70f2311b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2865,12 +2865,19 @@ void re_initialize(void) void set_max_cpu_num(void) { FILE *filep; + int base_cpu; unsigned long dummy; + char pathname[64]; + base_cpu = sched_getcpu(); + if (base_cpu < 0) + err(1, "cannot find calling cpu ID"); + sprintf(pathname, + "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", + base_cpu); + + filep = fopen_or_die(pathname, "r"); topo.max_cpu_num = 0; - filep = fopen_or_die( - "/sys/devices/system/cpu/cpu0/topology/thread_siblings", - "r"); while (fscanf(filep, "%lx,", &dummy) == 1) topo.max_cpu_num += BITMASK_SIZE; fclose(filep); -- cgit v1.2.3 From b88cad57d4d32bb5c53cd8e0ce3a1971062142af Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Wed, 8 Jul 2020 12:55:30 +0200 Subject: tools/power turbostat: Replace HTTP links with HTTPS ones: TURBOSTAT UTILITY Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. Klimov Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index a6db83a88e85..f6b7e85b121c 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -335,7 +335,7 @@ that they count at TSC rate, which is true on all processors tested to date. .SH REFERENCES Volume 3B: System Programming Guide" -http://www.intel.com/products/processor/manuals/ +https://www.intel.com/products/processor/manuals/ .SH FILES .ta -- cgit v1.2.3 From fecb3bc839df64761cc63c9ee9b45c1cad36aee8 Mon Sep 17 00:00:00 2001 From: David Arcari Date: Mon, 10 Aug 2020 10:43:30 -0400 Subject: tools/power turbostat: Fix output formatting for ACPI CST enumeration turbostat formatting is broken with ACPI CST for enumeration. The problem is that the CX_ACPI% is eight characters long which does not work with tab formatting. One simple solution is to remove the underbar from the state name such that C1_ACPI will be displayed as C1ACPI. Signed-off-by: David Arcari Cc: Len Brown Cc: linux-kernel@vger.kernel.org Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 151a70f2311b..56b93e0d9415 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3682,6 +3682,20 @@ int has_config_tdp(unsigned int family, unsigned int model) } } +static void +remove_underbar(char *s) +{ + char *to = s; + + while (*s) { + if (*s != '_') + *to++ = *s; + s++; + } + + *to = 0; +} + static void dump_cstate_pstate_config_info(unsigned int family, unsigned int model) { @@ -3764,6 +3778,8 @@ dump_sysfs_cstate_config(void) *sp = '\0'; fclose(input); + remove_underbar(name_buf); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", base_cpu, state); input = fopen(path, "r"); @@ -5830,6 +5846,8 @@ void probe_sysfs(void) *sp = '%'; *(sp + 1) = '\0'; + remove_underbar(name_buf); + fclose(input); sprintf(path, "cpuidle/state%d/time", state); @@ -5857,6 +5875,8 @@ void probe_sysfs(void) *sp = '\0'; fclose(input); + remove_underbar(name_buf); + sprintf(path, "cpuidle/state%d/usage", state); if (is_deferred_skip(name_buf)) -- cgit v1.2.3 From e7af1ed3fa4756e8df8270a8635d852a94266061 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 13 Aug 2020 19:06:03 -0400 Subject: tools/power turbostat: Support additional CPU model numbers Initial support for models recently added to intel-family.h. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 56b93e0d9415..4ee4e3067681 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4906,6 +4906,9 @@ unsigned int intel_model_duplicates(unsigned int model) case INTEL_FAM6_ICELAKE_NNPI: case INTEL_FAM6_TIGERLAKE_L: case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_ROCKETLAKE: + case INTEL_FAM6_LAKEFIELD: + case INTEL_FAM6_ALDERLAKE: return INTEL_FAM6_CANNONLAKE_L; case INTEL_FAM6_ATOM_TREMONT_D: @@ -4915,6 +4918,7 @@ unsigned int intel_model_duplicates(unsigned int model) return INTEL_FAM6_ATOM_TREMONT; case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_SAPPHIRERAPIDS_X: return INTEL_FAM6_SKYLAKE_X; } return model; -- cgit v1.2.3 From c315a09b1b0f491c27d46e9d05f397023a44fb81 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 13 Aug 2020 19:18:22 -0400 Subject: tools/power turbostat: Skip pc8, pc9, pc10 columns, if they are disabled Like we skip PC3 and PC6 columns when the package C-state limit disables them, skip PC8/PC9/CP10 under analogous conditions. Reported-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4ee4e3067681..72c0b19db36e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5186,9 +5186,12 @@ void process_cpuid() BIC_NOT_PRESENT(BIC_Pkgpc7); } if (has_c8910_msrs(family, model)) { - BIC_PRESENT(BIC_Pkgpc8); - BIC_PRESENT(BIC_Pkgpc9); - BIC_PRESENT(BIC_Pkgpc10); + if (pkg_cstate_limit >= PCL__8) + BIC_PRESENT(BIC_Pkgpc8); + if (pkg_cstate_limit >= PCL__9) + BIC_PRESENT(BIC_Pkgpc9); + if (pkg_cstate_limit >= PCL_10) + BIC_PRESENT(BIC_Pkgpc10); } do_irtl_hsw = has_c8910_msrs(family, model); if (has_skl_msrs(family, model)) { -- cgit v1.2.3 From 0936cdfbb527a4fa2559292069ebff2e8cf2c843 Mon Sep 17 00:00:00 2001 From: Ondřej Lysoněk Date: Fri, 27 Mar 2020 08:27:12 +0100 Subject: tools/power x86_energy_perf_policy: Input/output error in a VM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I've encountered an issue with x86_energy_perf_policy. If I run it on a machine that I'm told is a qemu-kvm virtual machine running inside a privileged container, I get the following error: x86_energy_perf_policy: /dev/cpu/0/msr offset 0x1ad read failed: Input/output error I get the same error in a Digital Ocean droplet, so that might be a similar environment. I created the following patch which is intended to give a more user-friendly message. It's based on a patch for turbostat from Prarit Bhargava that was posted some time ago. The patch is "[v2] turbostat: Running on virtual machine is not supported" [1]. Given my limited knowledge of the topic, I can't say with confidence that this is the right solution, though (that's why this is not an official patch submission). Also, I'm not sure what the convention with exit codes is in this tool. Also, instead of the error message, perhaps the tool should just not print anything in this case, which is how it behaves in a "regular" VM? [1] https://patchwork.kernel.org/patch/9868587/ Signed-off-by: Ondřej Lysoněk Signed-off-by: Len Brown --- .../x86_energy_perf_policy.c | 67 +++++++++++++++++----- 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index 3fe1eed900d4..ff6c6661f075 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -622,6 +622,57 @@ void cmdline(int argc, char **argv) } } +/* + * Open a file, and exit on failure + */ +FILE *fopen_or_die(const char *path, const char *mode) +{ + FILE *filep = fopen(path, "r"); + + if (!filep) + err(1, "%s: open failed", path); + return filep; +} + +void err_on_hypervisor(void) +{ + FILE *cpuinfo; + char *flags, *hypervisor; + char *buffer; + + /* On VMs /proc/cpuinfo contains a "flags" entry for hypervisor */ + cpuinfo = fopen_or_die("/proc/cpuinfo", "ro"); + + buffer = malloc(4096); + if (!buffer) { + fclose(cpuinfo); + err(-ENOMEM, "buffer malloc fail"); + } + + if (!fread(buffer, 1024, 1, cpuinfo)) { + fclose(cpuinfo); + free(buffer); + err(1, "Reading /proc/cpuinfo failed"); + } + + flags = strstr(buffer, "flags"); + rewind(cpuinfo); + fseek(cpuinfo, flags - buffer, SEEK_SET); + if (!fgets(buffer, 4096, cpuinfo)) { + fclose(cpuinfo); + free(buffer); + err(1, "Reading /proc/cpuinfo failed"); + } + fclose(cpuinfo); + + hypervisor = strstr(buffer, "hypervisor"); + + free(buffer); + + if (hypervisor) + err(-1, + "not supported on this virtual machine"); +} int get_msr(int cpu, int offset, unsigned long long *msr) { @@ -635,8 +686,10 @@ int get_msr(int cpu, int offset, unsigned long long *msr) err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname); retval = pread(fd, msr, sizeof(*msr), offset); - if (retval != sizeof(*msr)) + if (retval != sizeof(*msr)) { + err_on_hypervisor(); err(-1, "%s offset 0x%llx read failed", pathname, (unsigned long long)offset); + } if (debug > 1) fprintf(stderr, "get_msr(cpu%d, 0x%X, 0x%llX)\n", cpu, offset, *msr); @@ -1086,18 +1139,6 @@ int update_cpu_msrs(int cpu) return 0; } -/* - * Open a file, and exit on failure - */ -FILE *fopen_or_die(const char *path, const char *mode) -{ - FILE *filep = fopen(path, "r"); - - if (!filep) - err(1, "%s: open failed", path); - return filep; -} - unsigned int get_pkg_num(int cpu) { FILE *fp; -- cgit v1.2.3 From b4b9156953fea108a9540c262e48eafeeff99ab0 Mon Sep 17 00:00:00 2001 From: Rafael Antognolli Date: Wed, 22 Apr 2020 15:02:07 -0700 Subject: tools/power turbostat: Add a new GFXAMHz column that exposes gt_act_freq_mhz. The column already present called GFXMHz reads from gt_cur_freq_mhz, which represents the GT frequency that was requested, but power management might not be able to do that. So the new column will display what the actual frequency GT is running at. Signed-off-by: Rafael Antognolli Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 72c0b19db36e..7a6e91aedf0f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -79,6 +79,7 @@ unsigned long long gfx_cur_rc6_ms; unsigned long long cpuidle_cur_cpu_lpi_us; unsigned long long cpuidle_cur_sys_lpi_us; unsigned int gfx_cur_mhz; +unsigned int gfx_act_mhz; unsigned int tcc_activation_temp; unsigned int tcc_activation_temp_override; double rapl_power_units, rapl_time_units; @@ -210,6 +211,7 @@ struct pkg_data { unsigned long long pkg_both_core_gfxe_c0; long long gfx_rc6_ms; unsigned int gfx_mhz; + unsigned int gfx_act_mhz; unsigned int package_id; unsigned long long energy_pkg; /* MSR_PKG_ENERGY_STATUS */ unsigned long long energy_dram; /* MSR_DRAM_ENERGY_STATUS */ @@ -558,6 +560,7 @@ struct msr_counter bic[] = { { 0x0, "APIC" }, { 0x0, "X2APIC" }, { 0x0, "Die" }, + { 0x0, "GFXAMHz" }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -612,6 +615,7 @@ struct msr_counter bic[] = { #define BIC_APIC (1ULL << 48) #define BIC_X2APIC (1ULL << 49) #define BIC_Die (1ULL << 50) +#define BIC_GFXACTMHz (1ULL << 51) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -831,6 +835,9 @@ void print_header(char *delim) if (DO_BIC(BIC_GFXMHz)) outp += sprintf(outp, "%sGFXMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_GFXACTMHz)) + outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : "")); if (DO_BIC(BIC_Any_c0)) @@ -1198,6 +1205,10 @@ int format_counters(struct thread_data *t, struct core_data *c, if (DO_BIC(BIC_GFXMHz)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_mhz); + /* GFXACTMHz */ + if (DO_BIC(BIC_GFXACTMHz)) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz); + /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */ if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0/tsc); @@ -1349,6 +1360,7 @@ delta_package(struct pkg_data *new, struct pkg_data *old) old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms; old->gfx_mhz = new->gfx_mhz; + old->gfx_act_mhz = new->gfx_act_mhz; old->energy_pkg = new->energy_pkg - old->energy_pkg; old->energy_cores = new->energy_cores - old->energy_cores; @@ -1565,6 +1577,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->gfx_rc6_ms = 0; p->gfx_mhz = 0; + p->gfx_act_mhz = 0; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) t->counter[i] = 0; @@ -1660,6 +1673,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, average.packages.gfx_rc6_ms = p->gfx_rc6_ms; average.packages.gfx_mhz = p->gfx_mhz; + average.packages.gfx_act_mhz = p->gfx_act_mhz; average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); @@ -2108,6 +2122,9 @@ retry: if (DO_BIC(BIC_GFXMHz)) p->gfx_mhz = gfx_cur_mhz; + if (DO_BIC(BIC_GFXACTMHz)) + p->gfx_act_mhz = gfx_act_mhz; + for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &p->counter[i])) return -10; @@ -3018,6 +3035,33 @@ int snapshot_gfx_mhz(void) return 0; } +/* + * snapshot_gfx_cur_mhz() + * + * record snapshot of + * /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz + * + * return 1 if config change requires a restart, else return 0 + */ +int snapshot_gfx_act_mhz(void) +{ + static FILE *fp; + int retval; + + if (fp == NULL) + fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r"); + else { + rewind(fp); + fflush(fp); + } + + retval = fscanf(fp, "%d", &gfx_act_mhz); + if (retval != 1) + err(1, "GFX ACT MHz"); + + return 0; +} + /* * snapshot_cpu_lpi() * @@ -3083,6 +3127,9 @@ int snapshot_proc_sysfs_files(void) if (DO_BIC(BIC_GFXMHz)) snapshot_gfx_mhz(); + if (DO_BIC(BIC_GFXACTMHz)) + snapshot_gfx_act_mhz(); + if (DO_BIC(BIC_CPU_LPI)) snapshot_cpu_lpi_us(); @@ -5236,6 +5283,9 @@ void process_cpuid() if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) BIC_PRESENT(BIC_GFXMHz); + if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + BIC_PRESENT(BIC_GFXACTMHz); + if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) BIC_PRESENT(BIC_CPU_LPI); else -- cgit v1.2.3 From 20de0dab238849414d33c81bc96e2db68cc61467 Mon Sep 17 00:00:00 2001 From: Antti Laakso Date: Mon, 17 Aug 2020 18:03:48 +0300 Subject: tools/power turbostat: Remove empty columns for Jacobsville Jacobsville doesn't have Package C2 and C6. Also Core and DRAM RAPL are not available. Adjust output accordingly. Signed-off-by: Antti Laakso Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 7a6e91aedf0f..629b809075c1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2286,6 +2286,7 @@ int has_turbo_ratio_group_limits(int family, int model) case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_ATOM_GOLDMONT_D: + case INTEL_FAM6_ATOM_TREMONT_D: return 1; } return 0; @@ -3534,6 +3535,7 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ case INTEL_FAM6_ATOM_TREMONT: /* EHL */ + case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ pkg_cstate_limits = glm_pkg_cstate_limits; break; default: @@ -3616,6 +3618,17 @@ int is_ehl(unsigned int family, unsigned int model) } return 0; } +int is_jvl(unsigned int family, unsigned int model) +{ + if (!genuine_intel) + return 0; + + switch (model) { + case INTEL_FAM6_ATOM_TREMONT_D: + return 1; + } + return 0; +} int has_turbo_ratio_limit(unsigned int family, unsigned int model) { @@ -4227,6 +4240,14 @@ void rapl_probe_intel(unsigned int family, unsigned int model) BIC_PRESENT(BIC_GFXWatt); } break; + case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ + do_rapl = RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO; + BIC_PRESENT(BIC_PKG__); + if (rapl_joules) + BIC_PRESENT(BIC_Pkg_J); + else + BIC_PRESENT(BIC_PkgWatt); + break; case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_GFX | RAPL_PKG_POWER_INFO; @@ -4629,6 +4650,7 @@ int has_snb_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ case INTEL_FAM6_ATOM_TREMONT: /* EHL */ + case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ return 1; } return 0; @@ -4958,9 +4980,6 @@ unsigned int intel_model_duplicates(unsigned int model) case INTEL_FAM6_ALDERLAKE: return INTEL_FAM6_CANNONLAKE_L; - case INTEL_FAM6_ATOM_TREMONT_D: - return INTEL_FAM6_ATOM_GOLDMONT_D; - case INTEL_FAM6_ATOM_TREMONT_L: return INTEL_FAM6_ATOM_TREMONT; @@ -5214,6 +5233,14 @@ void process_cpuid() BIC_PRESENT(BIC_Mod_c6); use_c1_residency_msr = 1; } + if (is_jvl(family, model)) { + BIC_NOT_PRESENT(BIC_CPU_c3); + BIC_NOT_PRESENT(BIC_CPU_c7); + BIC_NOT_PRESENT(BIC_Pkgpc2); + BIC_NOT_PRESENT(BIC_Pkgpc3); + BIC_NOT_PRESENT(BIC_Pkgpc6); + BIC_NOT_PRESENT(BIC_Pkgpc7); + } if (is_dnv(family, model)) { BIC_PRESENT(BIC_CPU_c1); BIC_NOT_PRESENT(BIC_CPU_c3); -- cgit v1.2.3 From 33eb82251af9be47a625ca1578f44e596a3a0ca9 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Mon, 17 Aug 2020 17:42:15 -0500 Subject: tools/power turbostat: Support AMD Family 19h Family 19h processors have the same RAPL (Running average power limit) hardware register interface as Family 17h processors. Change the family checks to succeed for Family 17h and above to enable core and package energy measurement on Family 19h machines. Also update the TDP to the largest found at the bottom of the page at amd.com->processors->servers->epyc->2nd-gen-epyc, i.e., the EPYC 7H12. Signed-off-by: Kim Phillips Cc: Len Brown Cc: Len Brown Cc: linux-pm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 629b809075c1..0869f791ed14 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4162,13 +4162,8 @@ double get_tdp_intel(unsigned int model) double get_tdp_amd(unsigned int family) { - switch (family) { - case 0x17: - case 0x18: - default: - /* This is the max stock TDP of HEDT/Server Fam17h chips */ - return 250.0; - } + /* This is the max stock TDP of HEDT/Server Fam17h+ chips */ + return 280.0; } /* @@ -4358,27 +4353,20 @@ void rapl_probe_amd(unsigned int family, unsigned int model) if (max_extended_level >= 0x80000007) { __cpuid(0x80000007, eax, ebx, ecx, edx); - /* RAPL (Fam 17h) */ + /* RAPL (Fam 17h+) */ has_rapl = edx & (1 << 14); } - if (!has_rapl) + if (!has_rapl || family < 0x17) return; - switch (family) { - case 0x17: /* Zen, Zen+ */ - case 0x18: /* Hygon Dhyana */ - do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - } - break; - default: - return; + do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY; + if (rapl_joules) { + BIC_PRESENT(BIC_Pkg_J); + BIC_PRESENT(BIC_Cor_J); + } else { + BIC_PRESENT(BIC_PkgWatt); + BIC_PRESENT(BIC_CorWatt); } if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr)) -- cgit v1.2.3 From 4be61e6b769fc3f97b58870aa4258e27968f07e1 Mon Sep 17 00:00:00 2001 From: Alexander Monakov Date: Sun, 23 Aug 2020 23:27:02 +0300 Subject: tools/power turbostat: Build with _FILE_OFFSET_BITS=64 For compatibility reasons, Glibc off_t is a 32-bit type on 32-bit x86 unless _FILE_OFFSET_BITS=64 is defined. Add this define, as otherwise reading MSRs with index 0x80000000 and above attempts a pread with a negative offset, which fails. Signed-off-by: Alexander Monakov Tested-by: Liwei Song Signed-off-by: Len Brown --- tools/power/x86/turbostat/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index d08765531bcb..f3e3c94ab9bd 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -12,6 +12,7 @@ turbostat : turbostat.c override CFLAGS += -O2 -Wall -I../../../include override CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"' override CFLAGS += -DINTEL_FAMILY_HEADER='"../../../../arch/x86/include/asm/intel-family.h"' +override CFLAGS += -D_FILE_OFFSET_BITS=64 override CFLAGS += -D_FORTIFY_SOURCE=2 %: %.c -- cgit v1.2.3 From 151a535171be6ff824a0a3875553ea38570f4c05 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 15 Oct 2020 21:41:44 +0100 Subject: genirq: Let GENERIC_IRQ_IPI select IRQ_DOMAIN_HIERARCHY kernel/irq/ipi.c otherwise fails to compile if nothing else selects it. Fixes: 379b656446a3 ("genirq: Add GENERIC_IRQ_IPI Kconfig symbol") Reported-by: Pavel Machek Tested-by: Pavel Machek Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201015101222.GA32747@amd --- kernel/irq/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 10a5aff4eecc..164a031cfdb6 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -82,6 +82,7 @@ config IRQ_FASTEOI_HIERARCHY_HANDLERS # Generic IRQ IPI support config GENERIC_IRQ_IPI bool + select IRQ_DOMAIN_HIERARCHY # Generic MSI interrupt support config GENERIC_MSI_IRQ -- cgit v1.2.3 From 61b0648d569aca932eab87a67f7ca0ffd3ea2b68 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 14 Oct 2020 15:17:03 +0200 Subject: irqchip/mst: MST_IRQ should depend on ARCH_MEDIATEK or ARCH_MSTARV7 The MStar interrupt controller is only found on MStar, SigmaStar, and Mediatek SoCs. Hence add dependencies on ARCH_MEDIATEK and ARCH_MSTARV7, to prevent asking the user about the MStar interrupt controller driver when configuring a kernel without support for MStar, SigmaStar, and Mediatek SoCs. Fixes: ad4c938c92af9130 ("irqchip/irq-mst: Add MStar interrupt controller support") Signed-off-by: Geert Uytterhoeven Signed-off-by: Marc Zyngier Acked-by: Daniel Palmer Link: https://lore.kernel.org/r/20201014131703.18021-1-geert+renesas@glider.be --- drivers/irqchip/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 570a7706875c..cd734df57c42 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -583,6 +583,7 @@ config LOONGSON_PCH_MSI config MST_IRQ bool "MStar Interrupt Controller" + depends on ARCH_MEDIATEK || ARCH_MSTARV7 || COMPILE_TEST default ARCH_MEDIATEK select IRQ_DOMAIN select IRQ_DOMAIN_HIERARCHY -- cgit v1.2.3 From 893a7cfb6b0bea650fafa43838d7f7f8f0f076bc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 15 Oct 2020 22:26:26 +0100 Subject: irqchip/mst: Make mst_intc_of_init static mst_intc_of_init has no external caller, so let's make it static. Reported-by: kernel test robot Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mst-intc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-mst-intc.c b/drivers/irqchip/irq-mst-intc.c index 4be077591898..143657b0cf28 100644 --- a/drivers/irqchip/irq-mst-intc.c +++ b/drivers/irqchip/irq-mst-intc.c @@ -154,8 +154,8 @@ static const struct irq_domain_ops mst_intc_domain_ops = { .free = irq_domain_free_irqs_common, }; -int __init -mst_intc_of_init(struct device_node *dn, struct device_node *parent) +static int __init mst_intc_of_init(struct device_node *dn, + struct device_node *parent) { struct irq_domain *domain, *domain_parent; struct mst_intc_chip_data *cd; -- cgit v1.2.3 From d26dd4131d0d6ad7aa294a7f8d18782b47c27c93 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 16 Oct 2020 09:28:23 +0100 Subject: irqchip/mips: Drop selection of IRQ_DOMAIN_HIERARCHY Now that GENERIC_IRQ_IPI selects IRQ_DOMAIN_HIERARCHY, there is no need to have this conditional select for IRQ_MIPS_CPU. Similarily, MIPS_GIC only needs selecting GENERIC_IRQ_IPI. Suggested-by: Thomas Gleixner Signed-off-by: Marc Zyngier --- drivers/irqchip/Kconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index cd734df57c42..38785a0fbb2a 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -180,7 +180,6 @@ config IRQ_MIPS_CPU select GENERIC_IRQ_CHIP select GENERIC_IRQ_IPI if SYS_SUPPORTS_MULTITHREADING select IRQ_DOMAIN - select IRQ_DOMAIN_HIERARCHY if GENERIC_IRQ_IPI select GENERIC_IRQ_EFFECTIVE_AFF_MASK config CLPS711X_IRQCHIP @@ -307,7 +306,6 @@ config KEYSTONE_IRQ config MIPS_GIC bool select GENERIC_IRQ_IPI - select IRQ_DOMAIN_HIERARCHY select MIPS_CM config INGENIC_IRQ -- cgit v1.2.3 From 6ff7cb371c4bea3dba03a56d774da925e78a5087 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 29 Sep 2020 17:28:42 -0400 Subject: tools/power turbostat: adjust for temperature offset cpu1: MSR_IA32_TEMPERATURE_TARGET: 0x05640000 (95 C) (100 default - 5 offset) Account for the new "offset" field in MSR_TEMPERATURE_TARGET. While this field is usually zero, ignoring it results in over-stating the current temperature, both per-core and per-package. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 62 ++++++++++++++++------------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 0869f791ed14..4122468fb73b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4785,12 +4785,33 @@ double discover_bclk(unsigned int family, unsigned int model) * below this value, including the Digital Thermal Sensor (DTS), * Package Thermal Management Sensor (PTM), and thermal event thresholds. */ -int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int read_tcc_activation_temp() { unsigned long long msr; - unsigned int target_c_local; - int cpu; + unsigned int tcc, target_c, offset_c; + + /* Temperature Target MSR is Nehalem and newer only */ + if (!do_nhm_platform_info) + return 0; + + if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) + return 0; + target_c = (msr >> 16) & 0xFF; + + offset_c = (msr >> 24) & 0xF; + + tcc = target_c - offset_c; + + if (!quiet) + fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", + base_cpu, msr, tcc, target_c, offset_c); + + return tcc; +} + +int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ /* tcc_activation_temp is used only for dts or ptm */ if (!(do_dts || do_ptm)) return 0; @@ -4799,43 +4820,18 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) return 0; - cpu = t->cpu_id; - if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); - return -1; - } - if (tcc_activation_temp_override != 0) { tcc_activation_temp = tcc_activation_temp_override; - fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", - cpu, tcc_activation_temp); + fprintf(outf, "Using cmdline TCC Target (%d C)\n", tcc_activation_temp); return 0; } - /* Temperature Target MSR is Nehalem and newer only */ - if (!do_nhm_platform_info) - goto guess; - - if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) - goto guess; - - target_c_local = (msr >> 16) & 0xFF; - - if (!quiet) - fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", - cpu, msr, target_c_local); - - if (!target_c_local) - goto guess; - - tcc_activation_temp = target_c_local; - - return 0; + tcc_activation_temp = read_tcc_activation_temp(); + if (tcc_activation_temp) + return 0; -guess: tcc_activation_temp = TJMAX_DEFAULT; - fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", - cpu, tcc_activation_temp); + fprintf(outf, "Guessing tjMax %d C, Please use -T to specify\n", tcc_activation_temp); return 0; } -- cgit v1.2.3 From 3d7772ea5602b88c7c7f0a50d512171a2eed6659 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 30 Sep 2020 20:58:15 -0400 Subject: tools/power turbostat: harden against cpu hotplug turbostat tends to get confused when CPUs are added and removed while it is running. There are races, such as checking the current cpu, and then reading a sysfs file that depends on that cpu number. Close the two issues that seem to come up the most. First, there is an infinite reset loop detector -- change that to allow more resets before giving up. Secondly, one of those file reads didn't really need to exit the program on failure... Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4122468fb73b..74d2fc6fc78a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1894,7 +1894,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) int i; if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "get_counters: Could not migrate to CPU %d\n", cpu); return -1; } @@ -2764,7 +2764,12 @@ int get_thread_siblings(struct cpu_topology *thiscpu) sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu); - filep = fopen_or_die(path, "r"); + filep = fopen(path, "r"); + + if (!filep) { + warnx("%s: open failed", path); + return -1; + } do { offset -= BITMASK_SIZE; if (fscanf(filep, "%lx%c", &map, &character) != 2) @@ -2877,7 +2882,7 @@ void re_initialize(void) { free_all_buffers(); setup_all_buffers(); - printf("turbostat: re-initialized with num_cpus %d\n", topo.num_cpus); + fprintf(outf, "turbostat: re-initialized with num_cpus %d\n", topo.num_cpus); } void set_max_cpu_num(void) @@ -3331,7 +3336,7 @@ restart: if (retval < -1) { exit(retval); } else if (retval == -1) { - if (restarted > 1) { + if (restarted > 10) { exit(retval); } re_initialize(); @@ -3926,7 +3931,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "print_epb: Could not migrate to CPU %d\n", cpu); return -1; } @@ -3970,7 +3975,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "print_hwp: Could not migrate to CPU %d\n", cpu); return -1; } @@ -4058,7 +4063,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data return 0; if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "print_perf_limit: Could not migrate to CPU %d\n", cpu); return -1; } @@ -4439,7 +4444,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p return 0; if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu); return -1; } @@ -4511,7 +4516,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu = t->cpu_id; if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "print_rapl: Could not migrate to CPU %d\n", cpu); return -1; } -- cgit v1.2.3 From 57733e009f0c7e0526e10a18be12f56996c5460e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 25 Oct 2020 11:10:29 +0000 Subject: irqchip/bcm2836: Fix missing __init annotation bcm2836_arm_irqchip_smp_init() calls set_smp_ipi_range(), which has an __init annotation. Make sure the caller has the same annotation. Reported-by: kernel test robot Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-bcm2836.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-bcm2836.c b/drivers/irqchip/irq-bcm2836.c index 97838eb705f9..cbc7c740e4dc 100644 --- a/drivers/irqchip/irq-bcm2836.c +++ b/drivers/irqchip/irq-bcm2836.c @@ -244,7 +244,7 @@ static int bcm2836_cpu_dying(unsigned int cpu) #define BITS_PER_MBOX 32 -static void bcm2836_arm_irqchip_smp_init(void) +static void __init bcm2836_arm_irqchip_smp_init(void) { struct irq_fwspec ipi_fwspec = { .fwnode = intc.domain->fwnode, -- cgit v1.2.3 From a00e85b581fd5ee47e770b6b8d2038dbebbe81f9 Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Fri, 16 Oct 2020 16:40:17 +0200 Subject: irqchip/stm32-exti: Add all LP timer exti direct events support Add all remaining LP timer exti direct events, e.g. for LP Timer 2 to 5. LP timer 1 is already listed (e.g. exti 47). Signed-off-by: Fabrice Gasnier Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1602859219-15684-2-git-send-email-fabrice.gasnier@st.com --- drivers/irqchip/irq-stm32-exti.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/irqchip/irq-stm32-exti.c b/drivers/irqchip/irq-stm32-exti.c index 0c2c61db26b4..8662d7b7b262 100644 --- a/drivers/irqchip/irq-stm32-exti.c +++ b/drivers/irqchip/irq-stm32-exti.c @@ -195,6 +195,10 @@ static const struct stm32_desc_irq stm32mp1_desc_irq[] = { { .exti = 25, .irq_parent = 107, .chip = &stm32_exti_h_chip_direct }, { .exti = 30, .irq_parent = 52, .chip = &stm32_exti_h_chip_direct }, { .exti = 47, .irq_parent = 93, .chip = &stm32_exti_h_chip_direct }, + { .exti = 48, .irq_parent = 138, .chip = &stm32_exti_h_chip_direct }, + { .exti = 50, .irq_parent = 139, .chip = &stm32_exti_h_chip_direct }, + { .exti = 52, .irq_parent = 140, .chip = &stm32_exti_h_chip_direct }, + { .exti = 53, .irq_parent = 141, .chip = &stm32_exti_h_chip_direct }, { .exti = 54, .irq_parent = 135, .chip = &stm32_exti_h_chip_direct }, { .exti = 61, .irq_parent = 100, .chip = &stm32_exti_h_chip_direct }, { .exti = 65, .irq_parent = 144, .chip = &stm32_exti_h_chip }, -- cgit v1.2.3 From a7480c5d725c4ecfc627e70960f249c34f5d13e8 Mon Sep 17 00:00:00 2001 From: Greentime Hu Date: Tue, 20 Oct 2020 16:15:32 +0800 Subject: irqchip/sifive-plic: Fix broken irq_set_affinity() callback An interrupt submitted to an affinity change will always be left enabled after plic_set_affinity() has been called, while the expectation is that it should stay in whatever state it was before the call. Preserving the configuration fixes a PWM hang issue on the Unleashed board. [ 919.015783] rcu: INFO: rcu_sched detected stalls on CPUs/tasks: [ 919.020922] rcu: 0-...0: (0 ticks this GP) idle=7d2/1/0x4000000000000002 softirq=1424/1424 fqs=105807 [ 919.030295] (detected by 1, t=225825 jiffies, g=1561, q=3496) [ 919.036109] Task dump for CPU 0: [ 919.039321] kworker/0:1 R running task 0 30 2 0x00000008 [ 919.046359] Workqueue: events set_brightness_delayed [ 919.051302] Call Trace: [ 919.053738] [] __schedule+0x194/0x4de [ 982.035783] rcu: INFO: rcu_sched detected stalls on CPUs/tasks: [ 982.040923] rcu: 0-...0: (0 ticks this GP) idle=7d2/1/0x4000000000000002 softirq=1424/1424 fqs=113325 [ 982.050294] (detected by 1, t=241580 jiffies, g=1561, q=3509) [ 982.056108] Task dump for CPU 0: [ 982.059321] kworker/0:1 R running task 0 30 2 0x00000008 [ 982.066359] Workqueue: events set_brightness_delayed [ 982.071302] Call Trace: [ 982.073739] [] __schedule+0x194/0x4de [..] Fixes: bb0fed1c60cc ("irqchip/sifive-plic: Switch to fasteoi flow") Signed-off-by: Greentime Hu [maz: tidy-up commit message] Signed-off-by: Marc Zyngier Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20201020081532.2377-1-greentime.hu@sifive.com --- drivers/irqchip/irq-sifive-plic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index eaa3e9fe54e9..4048657ece0a 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -151,7 +151,7 @@ static int plic_set_affinity(struct irq_data *d, return -EINVAL; plic_irq_toggle(&priv->lmask, d, 0); - plic_irq_toggle(cpumask_of(cpu), d, 1); + plic_irq_toggle(cpumask_of(cpu), d, !irqd_irq_masked(d)); irq_data_update_effective_affinity(d, cpumask_of(cpu)); -- cgit v1.2.3 From 472547778de24e2764ab325268dd5b77e6923939 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 22 Oct 2020 13:27:38 -0700 Subject: selftest/bpf: Fix profiler test using CO-RE relocation for enums Instead of hard-coding invalid pids_cgrp_id, use Kconfig to detect the presence of that enum value and CO-RE to capture its actual value in the hosts's kernel. Fixes: 03d4d13fab3f ("selftests/bpf: Add profiler test") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Tested-by: Song Liu Link: https://lore.kernel.org/bpf/20201022202739.3667367-1-andrii@kernel.org --- tools/testing/selftests/bpf/progs/profiler.inc.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h index 00578311a423..30982a7e4d0f 100644 --- a/tools/testing/selftests/bpf/progs/profiler.inc.h +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -243,7 +243,10 @@ static ino_t get_inode_from_kernfs(struct kernfs_node* node) } } -int pids_cgrp_id = 1; +extern bool CONFIG_CGROUP_PIDS __kconfig __weak; +enum cgroup_subsys_id___local { + pids_cgrp_id___local = 123, /* value doesn't matter */ +}; static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, struct task_struct* task, @@ -253,7 +256,9 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, BPF_CORE_READ(task, nsproxy, cgroup_ns, root_cset, dfl_cgrp, kn); struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn); - if (ENABLE_CGROUP_V1_RESOLVER) { + if (ENABLE_CGROUP_V1_RESOLVER && CONFIG_CGROUP_PIDS) { + int cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id___local, + pids_cgrp_id___local); #ifdef UNROLL #pragma unroll #endif @@ -262,7 +267,7 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, BPF_CORE_READ(task, cgroups, subsys[i]); if (subsys != NULL) { int subsys_id = BPF_CORE_READ(subsys, ss, id); - if (subsys_id == pids_cgrp_id) { + if (subsys_id == cgrp_id) { proc_kernfs = BPF_CORE_READ(subsys, cgroup, kn); root_kernfs = BPF_CORE_READ(subsys, ss, root, kf_root, kn); break; -- cgit v1.2.3 From 1384ab4fee12c4c4f8bd37bc9f8686881587b286 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Tue, 6 Oct 2020 16:02:50 +0300 Subject: usb: dwc3: pci: add support for the Intel Alder Lake-S This patch adds the necessary PCI ID for Intel Alder Lake-S devices. Signed-off-by: Heikki Krogerus Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/dwc3-pci.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c index 242b6210380a..bae6a70664c8 100644 --- a/drivers/usb/dwc3/dwc3-pci.c +++ b/drivers/usb/dwc3/dwc3-pci.c @@ -40,6 +40,7 @@ #define PCI_DEVICE_ID_INTEL_TGPLP 0xa0ee #define PCI_DEVICE_ID_INTEL_TGPH 0x43ee #define PCI_DEVICE_ID_INTEL_JSP 0x4dee +#define PCI_DEVICE_ID_INTEL_ADLS 0x7ae1 #define PCI_INTEL_BXT_DSM_GUID "732b85d5-b7a7-4a1b-9ba0-4bbd00ffd511" #define PCI_INTEL_BXT_FUNC_PMU_PWR 4 @@ -367,6 +368,9 @@ static const struct pci_device_id dwc3_pci_id_table[] = { { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_JSP), (kernel_ulong_t) &dwc3_pci_intel_properties, }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ADLS), + (kernel_ulong_t) &dwc3_pci_intel_properties, }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_NL_USB), (kernel_ulong_t) &dwc3_pci_amd_properties, }, { } /* Terminating Entry */ -- cgit v1.2.3 From 0d66e04875c5aae876cf3d4f4be7978fa2b00523 Mon Sep 17 00:00:00 2001 From: Evgeny Novikov Date: Fri, 2 Oct 2020 18:01:55 +0300 Subject: usb: gadget: goku_udc: fix potential crashes in probe goku_probe() goes to error label "err" and invokes goku_remove() in case of failures of pci_enable_device(), pci_resource_start() and ioremap(). goku_remove() gets a device from pci_get_drvdata(pdev) and works with it without any checks, in particular it dereferences a corresponding pointer. But goku_probe() did not set this device yet. So, one can expect various crashes. The patch moves setting the device just after allocation of memory for it. Found by Linux Driver Verification project (linuxtesting.org). Reported-by: Pavel Andrianov Signed-off-by: Evgeny Novikov Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/goku_udc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/goku_udc.c b/drivers/usb/gadget/udc/goku_udc.c index 25c1d6ab5adb..3e1267d38774 100644 --- a/drivers/usb/gadget/udc/goku_udc.c +++ b/drivers/usb/gadget/udc/goku_udc.c @@ -1760,6 +1760,7 @@ static int goku_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err; } + pci_set_drvdata(pdev, dev); spin_lock_init(&dev->lock); dev->pdev = pdev; dev->gadget.ops = &goku_ops; @@ -1793,7 +1794,6 @@ static int goku_probe(struct pci_dev *pdev, const struct pci_device_id *id) } dev->regs = (struct goku_udc_regs __iomem *) base; - pci_set_drvdata(pdev, dev); INFO(dev, "%s\n", driver_desc); INFO(dev, "version: " DRIVER_VERSION " %s\n", dmastr()); INFO(dev, "irq %d, pci mem %p\n", pdev->irq, base); -- cgit v1.2.3 From 48e7bbbbb261b007fe78aa14ae62df01d236497e Mon Sep 17 00:00:00 2001 From: Ran Wang Date: Fri, 16 Oct 2020 12:33:26 +0800 Subject: usb: gadget: fsl: fix null pointer checking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fsl_ep_fifo_status() should return error if _ep->desc is null. Fixes: 75eaa498c99e (“usb: gadget: Correct NULL pointer checking in fsl gadget”) Reviewed-by: Peter Chen Signed-off-by: Ran Wang Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/fsl_udc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/fsl_udc_core.c b/drivers/usb/gadget/udc/fsl_udc_core.c index de528e3b0662..ad6ff9c4188e 100644 --- a/drivers/usb/gadget/udc/fsl_udc_core.c +++ b/drivers/usb/gadget/udc/fsl_udc_core.c @@ -1051,7 +1051,7 @@ static int fsl_ep_fifo_status(struct usb_ep *_ep) u32 bitmask; struct ep_queue_head *qh; - if (!_ep || _ep->desc || !(_ep->desc->bEndpointAddress&0xF)) + if (!_ep || !_ep->desc || !(_ep->desc->bEndpointAddress&0xF)) return -ENODEV; ep = container_of(_ep, struct fsl_ep, ep); -- cgit v1.2.3 From fa27e2f6c5e674f3f1225f9ca7a7821faaf393bb Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Thu, 22 Oct 2020 15:44:59 -0700 Subject: usb: dwc3: ep0: Fix delay status handling If we want to send a control status on our own time (through delayed_status), make sure to handle a case where we may queue the delayed status before the host requesting for it (when XferNotReady is generated). Otherwise, the driver won't send anything because it's not EP0_STATUS_PHASE yet. To resolve this, regardless whether dwc->ep0state is EP0_STATUS_PHASE, make sure to clear the dwc->delayed_status flag if dwc3_ep0_send_delayed_status() is called. The control status can be sent when the host requests it later. Cc: Fixes: d97c78a1908e ("usb: dwc3: gadget: END_TRANSFER before CLEAR_STALL command") Signed-off-by: Thinh Nguyen Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/ep0.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index 7be3903cb842..8b668ef46f7f 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -1058,10 +1058,11 @@ void dwc3_ep0_send_delayed_status(struct dwc3 *dwc) { unsigned int direction = !dwc->ep0_expect_in; + dwc->delayed_status = false; + if (dwc->ep0state != EP0_STATUS_PHASE) return; - dwc->delayed_status = false; __dwc3_ep0_do_control_status(dwc, dwc->eps[direction]); } -- cgit v1.2.3 From 190bb01b72d2d5c3654a03c42fb1ad0dc6114c79 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sat, 17 Oct 2020 18:50:12 +0200 Subject: usb: dwc2: Avoid leaving the error_debugfs label unused MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The error_debugfs label is only used when either CONFIG_USB_DWC2_PERIPHERAL or CONFIG_USB_DWC2_DUAL_ROLE is enabled. Add the same #if to the error_debugfs label itself as the code which uses this label already has. This avoids the following compiler warning: warning: label ‘error_debugfs’ defined but not used [-Wunused-label] Fixes: e1c08cf23172ed ("usb: dwc2: Add missing cleanups when usb_add_gadget_udc() fails") Acked-by: Minas Harutyunyan Reported-by: kernel test robot Reported-by: Jens Axboe Signed-off-by: Martin Blumenstingl Signed-off-by: Felipe Balbi --- drivers/usb/dwc2/platform.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/dwc2/platform.c b/drivers/usb/dwc2/platform.c index e2820676beb1..5f18acac7406 100644 --- a/drivers/usb/dwc2/platform.c +++ b/drivers/usb/dwc2/platform.c @@ -608,10 +608,13 @@ static int dwc2_driver_probe(struct platform_device *dev) #endif /* CONFIG_USB_DWC2_PERIPHERAL || CONFIG_USB_DWC2_DUAL_ROLE */ return 0; +#if IS_ENABLED(CONFIG_USB_DWC2_PERIPHERAL) || \ + IS_ENABLED(CONFIG_USB_DWC2_DUAL_ROLE) error_debugfs: dwc2_debugfs_exit(hsotg); if (hsotg->hcd_enabled) dwc2_hcd_remove(hsotg); +#endif error_drd: dwc2_drd_exit(hsotg); -- cgit v1.2.3 From 129aa9734559a17990ee933351c7b6956f1dba62 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Tue, 27 Oct 2020 15:30:44 +0800 Subject: usb: raw-gadget: fix memory leak in gadget_setup When fetch 'event' from event queue, after copy its address space content to user space, the 'event' the memory space pointed to by the 'event' pointer need be freed. BUG: memory leak unreferenced object 0xffff888110622660 (size 32): comm "softirq", pid 0, jiffies 4294941981 (age 12.480s) hex dump (first 32 bytes): 02 00 00 00 08 00 00 00 80 06 00 01 00 00 40 00 ..............@. 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000efd29abd>] kmalloc include/linux/slab.h:554 [inline] [<00000000efd29abd>] raw_event_queue_add drivers/usb/gadget/legacy/raw_gadget.c:66 [inline] [<00000000efd29abd>] raw_queue_event drivers/usb/gadget/legacy/raw_gadget.c:225 [inline] [<00000000efd29abd>] gadget_setup+0xf6/0x220 drivers/usb/gadget/legacy/raw_gadget.c:343 [<00000000952c4a46>] dummy_timer+0xb9f/0x14c0 drivers/usb/gadget/udc/dummy_hcd.c:1899 [<0000000074ac2c54>] call_timer_fn+0x38/0x200 kernel/time/timer.c:1415 [<00000000560a3a79>] expire_timers kernel/time/timer.c:1460 [inline] [<00000000560a3a79>] __run_timers.part.0+0x319/0x400 kernel/time/timer.c:1757 [<000000009d9503d0>] __run_timers kernel/time/timer.c:1738 [inline] [<000000009d9503d0>] run_timer_softirq+0x3d/0x80 kernel/time/timer.c:1770 [<000000009df27c89>] __do_softirq+0xcc/0x2c2 kernel/softirq.c:298 [<000000007a3f1a47>] asm_call_irq_on_stack+0xf/0x20 [<000000004a62cc2e>] __run_on_irqstack arch/x86/include/asm/irq_stack.h:26 [inline] [<000000004a62cc2e>] run_on_irqstack_cond arch/x86/include/asm/irq_stack.h:77 [inline] [<000000004a62cc2e>] do_softirq_own_stack+0x32/0x40 arch/x86/kernel/irq_64.c:77 [<00000000b0086800>] invoke_softirq kernel/softirq.c:393 [inline] [<00000000b0086800>] __irq_exit_rcu kernel/softirq.c:423 [inline] [<00000000b0086800>] irq_exit_rcu+0x91/0xc0 kernel/softirq.c:435 [<00000000175f9523>] sysvec_apic_timer_interrupt+0x36/0x80 arch/x86/kernel/apic/apic.c:1091 [<00000000a348e847>] asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:631 [<0000000060661100>] native_safe_halt arch/x86/include/asm/irqflags.h:60 [inline] [<0000000060661100>] arch_safe_halt arch/x86/include/asm/irqflags.h:103 [inline] [<0000000060661100>] acpi_safe_halt drivers/acpi/processor_idle.c:111 [inline] [<0000000060661100>] acpi_idle_do_entry+0xc3/0xd0 drivers/acpi/processor_idle.c:517 [<000000003f413b99>] acpi_idle_enter+0x128/0x1f0 drivers/acpi/processor_idle.c:648 [<00000000f5e5afb8>] cpuidle_enter_state+0xc9/0x650 drivers/cpuidle/cpuidle.c:237 [<00000000d50d51fc>] cpuidle_enter+0x29/0x40 drivers/cpuidle/cpuidle.c:351 [<00000000d674baed>] call_cpuidle kernel/sched/idle.c:132 [inline] [<00000000d674baed>] cpuidle_idle_call kernel/sched/idle.c:213 [inline] [<00000000d674baed>] do_idle+0x1c8/0x250 kernel/sched/idle.c:273 Reported-by: syzbot+bd38200f53df6259e6bf@syzkaller.appspotmail.com Signed-off-by: Zqiang Signed-off-by: Felipe Balbi --- drivers/usb/gadget/legacy/raw_gadget.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/legacy/raw_gadget.c b/drivers/usb/gadget/legacy/raw_gadget.c index e01e366d89cd..062dfac30399 100644 --- a/drivers/usb/gadget/legacy/raw_gadget.c +++ b/drivers/usb/gadget/legacy/raw_gadget.c @@ -564,9 +564,12 @@ static int raw_ioctl_event_fetch(struct raw_dev *dev, unsigned long value) return -ENODEV; } length = min(arg.length, event->length); - if (copy_to_user((void __user *)value, event, sizeof(*event) + length)) + if (copy_to_user((void __user *)value, event, sizeof(*event) + length)) { + kfree(event); return -EFAULT; + } + kfree(event); return 0; } -- cgit v1.2.3 From 343a3e8bc635bd4c58d45a4fe67f9c3a78fbd191 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:20:50 +0100 Subject: bpf: Fix -Wshadow warnings There are thousands of warnings about one macro in a W=2 build: include/linux/filter.h:561:6: warning: declaration of 'ret' shadows a previous local [-Wshadow] Prefix all the locals in that macro with __ to avoid most of these warnings. Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201026162110.3710415-1-arnd@kernel.org --- include/linux/filter.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 72d62cbc1578..1b62397bd124 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -558,21 +558,21 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); #define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ - u32 ret; \ + u32 __ret; \ cant_migrate(); \ if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ - struct bpf_prog_stats *stats; \ - u64 start = sched_clock(); \ - ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - stats = this_cpu_ptr(prog->aux->stats); \ - u64_stats_update_begin(&stats->syncp); \ - stats->cnt++; \ - stats->nsecs += sched_clock() - start; \ - u64_stats_update_end(&stats->syncp); \ + struct bpf_prog_stats *__stats; \ + u64 __start = sched_clock(); \ + __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + __stats = this_cpu_ptr(prog->aux->stats); \ + u64_stats_update_begin(&__stats->syncp); \ + __stats->cnt++; \ + __stats->nsecs += sched_clock() - __start; \ + u64_stats_update_end(&__stats->syncp); \ } else { \ - ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ } \ - ret; }) + __ret; }) #define BPF_PROG_RUN(prog, ctx) \ __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func) -- cgit v1.2.3 From c66dca98a24cb5f3493dd08d40bcfa94a220fa92 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 27 Oct 2020 00:36:23 +0100 Subject: samples/bpf: Set rlimit for memlock to infinity in all samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memlock rlimit is a notorious source of failure for BPF programs. Most of the samples just set it to infinity, but a few used a lower limit. The problem with unconditionally setting a lower limit is that this will also override the limit if the system-wide setting is *higher* than the limit being set, which can lead to failures on systems that lock a lot of memory, but set 'ulimit -l' to unlimited before running a sample. One fix for this is to only conditionally set the limit if the current limit is lower, but it is simpler to just unify all the samples and have them all set the limit to infinity. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20201026233623.91728-1-toke@redhat.com --- samples/bpf/task_fd_query_user.c | 2 +- samples/bpf/tracex2_user.c | 2 +- samples/bpf/tracex3_user.c | 2 +- samples/bpf/xdp_redirect_cpu_user.c | 2 +- samples/bpf/xdp_rxq_info_user.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c index 4a74531dc403..b68bd2f8fdc9 100644 --- a/samples/bpf/task_fd_query_user.c +++ b/samples/bpf/task_fd_query_user.c @@ -290,7 +290,7 @@ static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) int main(int argc, char **argv) { - struct rlimit r = {1024*1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; extern char __executable_start; char filename[256], buf[256]; __u64 uprobe_file_offset; diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index 3e36b3e4e3ef..3d6eab711d23 100644 --- a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -116,7 +116,7 @@ static void int_exit(int sig) int main(int ac, char **argv) { - struct rlimit r = {1024*1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; long key, next_key, value; struct bpf_link *links[2]; struct bpf_program *prog; diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index 70e987775c15..83e0fecbb01a 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -107,7 +107,7 @@ static void print_hist(int fd) int main(int ac, char **argv) { - struct rlimit r = {1024*1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_link *links[2]; struct bpf_program *prog; struct bpf_object *obj; diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 6fb8dbde62c5..f78cb18319aa 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -765,7 +765,7 @@ static int load_cpumap_prog(char *file_name, char *prog_name, int main(int argc, char **argv) { - struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; char *prog_name = "xdp_cpu_map5_lb_hash_ip_pairs"; char *mprog_filename = "xdp_redirect_kern.o"; char *redir_interface = NULL, *redir_map = NULL; diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c index caa4e7ffcfc7..93fa1bc54f13 100644 --- a/samples/bpf/xdp_rxq_info_user.c +++ b/samples/bpf/xdp_rxq_info_user.c @@ -450,7 +450,7 @@ static void stats_poll(int interval, int action, __u32 cfg_opt) int main(int argc, char **argv) { __u32 cfg_options= NO_TOUCH ; /* Default: Don't touch packet memory */ - struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_prog_load_attr prog_load_attr = { .prog_type = BPF_PROG_TYPE_XDP, }; -- cgit v1.2.3 From dc6bf4da825aa0301a46f55fec7c0bb706af2aad Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 26 Oct 2020 16:20:32 -0400 Subject: selftests/ftrace: Use $FUNCTION_FORK to reference kernel fork function Commit cad6967ac108 ("fork: introduce kernel_clone()") replaced "_do_fork()" with "kernel_clone()". The ftrace selftests reference the fork function in several of the tests. The rename will make the tests break, but if those names are changed in the tests, they would then break on older kernels. The same set of tests should pass older kernels if they have previously passed. Obviously, a new test may not work on older kernels if the test was added due to a bug or a new feature. The setup of ftracetest will now create a $FUNCTION_FORK bash variable that will contain "_do_fork" for older kernels and "kernel_clone" for newer ones. It figures out the proper name by examining /proc/kallsyms. Note, available_filter_functions could also be used, but because some tests should be able to pass without function tracing enabled, it could not be used. Fixes: eea11285dab3 ("tracing: switch to kernel_clone()") Signed-off-by: Steven Rostedt (VMware) Acked-by: Masami Hiramatsu Signed-off-by: Shuah Khan --- .../selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc | 2 +- .../ftrace/test.d/dynevent/clear_select_events.tc | 2 +- .../ftrace/test.d/dynevent/generic_clear_event.tc | 2 +- .../ftrace/test.d/ftrace/func-filter-notrace-pid.tc | 2 +- .../selftests/ftrace/test.d/ftrace/func-filter-pid.tc | 2 +- .../ftrace/test.d/ftrace/func-filter-stacktrace.tc | 4 ++-- tools/testing/selftests/ftrace/test.d/functions | 7 +++++++ .../selftests/ftrace/test.d/kprobe/add_and_remove.tc | 2 +- tools/testing/selftests/ftrace/test.d/kprobe/busy_check.tc | 2 +- .../testing/selftests/ftrace/test.d/kprobe/kprobe_args.tc | 4 ++-- .../selftests/ftrace/test.d/kprobe/kprobe_args_comm.tc | 2 +- .../selftests/ftrace/test.d/kprobe/kprobe_args_string.tc | 4 ++-- .../selftests/ftrace/test.d/kprobe/kprobe_args_symbol.tc | 10 +++++----- .../selftests/ftrace/test.d/kprobe/kprobe_args_type.tc | 2 +- .../selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc | 14 +++++++------- .../selftests/ftrace/test.d/kprobe/kprobe_multiprobe.tc | 2 +- .../selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc | 12 ++++++------ .../selftests/ftrace/test.d/kprobe/kretprobe_args.tc | 4 ++-- tools/testing/selftests/ftrace/test.d/kprobe/profile.tc | 2 +- 19 files changed, 44 insertions(+), 37 deletions(-) diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc index 3bcd4c3624ee..b4da41d126d5 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc @@ -6,7 +6,7 @@ echo 0 > events/enable echo > dynamic_events -PLACE=kernel_clone +PLACE=$FUNCTION_FORK echo "p:myevent1 $PLACE" >> dynamic_events echo "r:myevent2 $PLACE" >> dynamic_events diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/clear_select_events.tc b/tools/testing/selftests/ftrace/test.d/dynevent/clear_select_events.tc index 438961971b7e..3a0e2885fff5 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/clear_select_events.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/clear_select_events.tc @@ -6,7 +6,7 @@ echo 0 > events/enable echo > dynamic_events -PLACE=kernel_clone +PLACE=$FUNCTION_FORK setup_events() { echo "p:myevent1 $PLACE" >> dynamic_events diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/generic_clear_event.tc b/tools/testing/selftests/ftrace/test.d/dynevent/generic_clear_event.tc index a8603bd23e0d..d3e138e8377f 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/generic_clear_event.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/generic_clear_event.tc @@ -6,7 +6,7 @@ echo 0 > events/enable echo > dynamic_events -PLACE=kernel_clone +PLACE=$FUNCTION_FORK setup_events() { echo "p:myevent1 $PLACE" >> dynamic_events diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc index acb17ce543d2..80541964b927 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc @@ -39,7 +39,7 @@ do_test() { disable_tracing echo do_execve* > set_ftrace_filter - echo *do_fork >> set_ftrace_filter + echo $FUNCTION_FORK >> set_ftrace_filter echo $PID > set_ftrace_notrace_pid echo function > current_tracer diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc index 9f0a9687c773..2f7211254529 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc @@ -39,7 +39,7 @@ do_test() { disable_tracing echo do_execve* > set_ftrace_filter - echo *do_fork >> set_ftrace_filter + echo $FUNCTION_FORK >> set_ftrace_filter echo $PID > set_ftrace_pid echo function > current_tracer diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc index 98305d76bd04..191d116b7883 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc @@ -4,9 +4,9 @@ # requires: set_ftrace_filter # flags: instance -echo kernel_clone:stacktrace >> set_ftrace_filter +echo $FUNCTION_FORK:stacktrace >> set_ftrace_filter -grep -q "kernel_clone:stacktrace:unlimited" set_ftrace_filter +grep -q "$FUNCTION_FORK:stacktrace:unlimited" set_ftrace_filter (echo "forked"; sleep 1) diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions index c5dec55b7d95..a6fac927ee82 100644 --- a/tools/testing/selftests/ftrace/test.d/functions +++ b/tools/testing/selftests/ftrace/test.d/functions @@ -133,6 +133,13 @@ yield() { ping $LOCALHOST -c 1 || sleep .001 || usleep 1 || sleep 1 } +# The fork function in the kernel was renamed from "_do_fork" to +# "kernel_fork". As older tests should still work with older kernels +# as well as newer kernels, check which version of fork is used on this +# kernel so that the tests can use the fork function for the running kernel. +FUNCTION_FORK=`(if grep '\bkernel_clone\b' /proc/kallsyms > /dev/null; then + echo kernel_clone; else echo '_do_fork'; fi)` + # Since probe event command may include backslash, explicitly use printf "%s" # to NOT interpret it. ftrace_errlog_check() { # err-prefix command-with-error-pos-by-^ command-file diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/add_and_remove.tc b/tools/testing/selftests/ftrace/test.d/kprobe/add_and_remove.tc index 9737cd0578a7..2428a3ed78c9 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/add_and_remove.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/add_and_remove.tc @@ -3,7 +3,7 @@ # description: Kprobe dynamic event - adding and removing # requires: kprobe_events -echo p:myevent kernel_clone > kprobe_events +echo p:myevent $FUNCTION_FORK > kprobe_events grep myevent kprobe_events test -d events/kprobes/myevent echo > kprobe_events diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/busy_check.tc b/tools/testing/selftests/ftrace/test.d/kprobe/busy_check.tc index f9a40af76888..010a8b1d6c1d 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/busy_check.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/busy_check.tc @@ -3,7 +3,7 @@ # description: Kprobe dynamic event - busy event check # requires: kprobe_events -echo p:myevent kernel_clone > kprobe_events +echo p:myevent $FUNCTION_FORK > kprobe_events test -d events/kprobes/myevent echo 1 > events/kprobes/myevent/enable echo > kprobe_events && exit_fail # this must fail diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args.tc index eb543d3cfe5f..a96a1dc7014f 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args.tc @@ -3,13 +3,13 @@ # description: Kprobe dynamic event with arguments # requires: kprobe_events -echo 'p:testprobe kernel_clone $stack $stack0 +0($stack)' > kprobe_events +echo "p:testprobe $FUNCTION_FORK \$stack \$stack0 +0(\$stack)" > kprobe_events grep testprobe kprobe_events | grep -q 'arg1=\$stack arg2=\$stack0 arg3=+0(\$stack)' test -d events/kprobes/testprobe echo 1 > events/kprobes/testprobe/enable ( echo "forked") -grep testprobe trace | grep 'kernel_clone' | \ +grep testprobe trace | grep "$FUNCTION_FORK" | \ grep -q 'arg1=0x[[:xdigit:]]* arg2=0x[[:xdigit:]]* arg3=0x[[:xdigit:]]*$' echo 0 > events/kprobes/testprobe/enable diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_comm.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_comm.tc index 4e5b63be51c9..a053ee2e7d77 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_comm.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_comm.tc @@ -5,7 +5,7 @@ grep -A1 "fetcharg:" README | grep -q "\$comm" || exit_unsupported # this is too old -echo 'p:testprobe kernel_clone comm=$comm ' > kprobe_events +echo "p:testprobe $FUNCTION_FORK comm=\$comm " > kprobe_events grep testprobe kprobe_events | grep -q 'comm=$comm' test -d events/kprobes/testprobe diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc index a1d70588ab21..84285a6f60b0 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc @@ -30,13 +30,13 @@ esac : "Test get argument (1)" echo "p:testprobe tracefs_create_dir arg1=+0(${ARG1}):string" > kprobe_events echo 1 > events/kprobes/testprobe/enable -echo "p:test kernel_clone" >> kprobe_events +echo "p:test $FUNCTION_FORK" >> kprobe_events grep -qe "testprobe.* arg1=\"test\"" trace echo 0 > events/kprobes/testprobe/enable : "Test get argument (2)" echo "p:testprobe tracefs_create_dir arg1=+0(${ARG1}):string arg2=+0(${ARG1}):string" > kprobe_events echo 1 > events/kprobes/testprobe/enable -echo "p:test kernel_clone" >> kprobe_events +echo "p:test $FUNCTION_FORK" >> kprobe_events grep -qe "testprobe.* arg1=\"test\" arg2=\"test\"" trace diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_symbol.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_symbol.tc index bd25dd0ba0d0..717130ed4feb 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_symbol.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_symbol.tc @@ -14,12 +14,12 @@ elif ! grep "$SYMBOL\$" /proc/kallsyms; then fi : "Test get basic types symbol argument" -echo "p:testprobe_u kernel_clone arg1=@linux_proc_banner:u64 arg2=@linux_proc_banner:u32 arg3=@linux_proc_banner:u16 arg4=@linux_proc_banner:u8" > kprobe_events -echo "p:testprobe_s kernel_clone arg1=@linux_proc_banner:s64 arg2=@linux_proc_banner:s32 arg3=@linux_proc_banner:s16 arg4=@linux_proc_banner:s8" >> kprobe_events +echo "p:testprobe_u $FUNCTION_FORK arg1=@linux_proc_banner:u64 arg2=@linux_proc_banner:u32 arg3=@linux_proc_banner:u16 arg4=@linux_proc_banner:u8" > kprobe_events +echo "p:testprobe_s $FUNCTION_FORK arg1=@linux_proc_banner:s64 arg2=@linux_proc_banner:s32 arg3=@linux_proc_banner:s16 arg4=@linux_proc_banner:s8" >> kprobe_events if grep -q "x8/16/32/64" README; then - echo "p:testprobe_x kernel_clone arg1=@linux_proc_banner:x64 arg2=@linux_proc_banner:x32 arg3=@linux_proc_banner:x16 arg4=@linux_proc_banner:x8" >> kprobe_events + echo "p:testprobe_x $FUNCTION_FORK arg1=@linux_proc_banner:x64 arg2=@linux_proc_banner:x32 arg3=@linux_proc_banner:x16 arg4=@linux_proc_banner:x8" >> kprobe_events fi -echo "p:testprobe_bf kernel_clone arg1=@linux_proc_banner:b8@4/32" >> kprobe_events +echo "p:testprobe_bf $FUNCTION_FORK arg1=@linux_proc_banner:b8@4/32" >> kprobe_events echo 1 > events/kprobes/enable (echo "forked") echo 0 > events/kprobes/enable @@ -27,7 +27,7 @@ grep "testprobe_[usx]:.* arg1=.* arg2=.* arg3=.* arg4=.*" trace grep "testprobe_bf:.* arg1=.*" trace : "Test get string symbol argument" -echo "p:testprobe_str kernel_clone arg1=@linux_proc_banner:string" > kprobe_events +echo "p:testprobe_str $FUNCTION_FORK arg1=@linux_proc_banner:string" > kprobe_events echo 1 > events/kprobes/enable (echo "forked") echo 0 > events/kprobes/enable diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc index 91fcce1c241c..25b7708eb559 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc @@ -4,7 +4,7 @@ # requires: kprobe_events "x8/16/32/64":README gen_event() { # Bitsize - echo "p:testprobe kernel_clone \$stack0:s$1 \$stack0:u$1 \$stack0:x$1 \$stack0:b4@4/$1" + echo "p:testprobe $FUNCTION_FORK \$stack0:s$1 \$stack0:u$1 \$stack0:x$1 \$stack0:b4@4/$1" } check_types() { # s-type u-type x-type bf-type width diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc index 0d179094191f..5556292601a4 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc @@ -5,29 +5,29 @@ # prepare echo nop > current_tracer -echo kernel_clone > set_ftrace_filter -echo 'p:testprobe kernel_clone' > kprobe_events +echo $FUNCTION_FORK > set_ftrace_filter +echo "p:testprobe $FUNCTION_FORK" > kprobe_events # kprobe on / ftrace off echo 1 > events/kprobes/testprobe/enable echo > trace ( echo "forked") grep testprobe trace -! grep 'kernel_clone <-' trace +! grep "$FUNCTION_FORK <-" trace # kprobe on / ftrace on echo function > current_tracer echo > trace ( echo "forked") grep testprobe trace -grep 'kernel_clone <-' trace +grep "$FUNCTION_FORK <-" trace # kprobe off / ftrace on echo 0 > events/kprobes/testprobe/enable echo > trace ( echo "forked") ! grep testprobe trace -grep 'kernel_clone <-' trace +grep "$FUNCTION_FORK <-" trace # kprobe on / ftrace on echo 1 > events/kprobes/testprobe/enable @@ -35,11 +35,11 @@ echo function > current_tracer echo > trace ( echo "forked") grep testprobe trace -grep 'kernel_clone <-' trace +grep "$FUNCTION_FORK <-" trace # kprobe on / ftrace off echo nop > current_tracer echo > trace ( echo "forked") grep testprobe trace -! grep 'kernel_clone <-' trace +! grep "$FUNCTION_FORK <-" trace diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_multiprobe.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_multiprobe.tc index 45d90b6c763d..f0d5b7777ed7 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_multiprobe.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_multiprobe.tc @@ -4,7 +4,7 @@ # requires: kprobe_events "Create/append/":README # Choose 2 symbols for target -SYM1=kernel_clone +SYM1=$FUNCTION_FORK SYM2=do_exit EVENT_NAME=kprobes/testevent diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc index 1b5550ef8a9b..fa928b431555 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc @@ -86,15 +86,15 @@ esac # multiprobe errors if grep -q "Create/append/" README && grep -q "imm-value" README; then -echo 'p:kprobes/testevent kernel_clone' > kprobe_events +echo "p:kprobes/testevent $FUNCTION_FORK" > kprobe_events check_error '^r:kprobes/testevent do_exit' # DIFF_PROBE_TYPE # Explicitly use printf "%s" to not interpret \1 -printf "%s" 'p:kprobes/testevent kernel_clone abcd=\1' > kprobe_events -check_error 'p:kprobes/testevent kernel_clone ^bcd=\1' # DIFF_ARG_TYPE -check_error 'p:kprobes/testevent kernel_clone ^abcd=\1:u8' # DIFF_ARG_TYPE -check_error 'p:kprobes/testevent kernel_clone ^abcd=\"foo"' # DIFF_ARG_TYPE -check_error '^p:kprobes/testevent kernel_clone abcd=\1' # SAME_PROBE +printf "%s" "p:kprobes/testevent $FUNCTION_FORK abcd=\\1" > kprobe_events +check_error "p:kprobes/testevent $FUNCTION_FORK ^bcd=\\1" # DIFF_ARG_TYPE +check_error "p:kprobes/testevent $FUNCTION_FORK ^abcd=\\1:u8" # DIFF_ARG_TYPE +check_error "p:kprobes/testevent $FUNCTION_FORK ^abcd=\\\"foo\"" # DIFF_ARG_TYPE +check_error "^p:kprobes/testevent $FUNCTION_FORK abcd=\\1" # SAME_PROBE fi # %return suffix errors diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_args.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_args.tc index 7ae492c204a4..197cc2afd404 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_args.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_args.tc @@ -4,14 +4,14 @@ # requires: kprobe_events # Add new kretprobe event -echo 'r:testprobe2 kernel_clone $retval' > kprobe_events +echo "r:testprobe2 $FUNCTION_FORK \$retval" > kprobe_events grep testprobe2 kprobe_events | grep -q 'arg1=\$retval' test -d events/kprobes/testprobe2 echo 1 > events/kprobes/testprobe2/enable ( echo "forked") -cat trace | grep testprobe2 | grep -q '<- kernel_clone' +cat trace | grep testprobe2 | grep -q "<- $FUNCTION_FORK" echo 0 > events/kprobes/testprobe2/enable echo '-:testprobe2' >> kprobe_events diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc b/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc index c4093fc1a773..98166fa3eb91 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc @@ -4,7 +4,7 @@ # requires: kprobe_events ! grep -q 'myevent' kprobe_profile -echo p:myevent kernel_clone > kprobe_events +echo "p:myevent $FUNCTION_FORK" > kprobe_events grep -q 'myevent[[:space:]]*0[[:space:]]*0$' kprobe_profile echo 1 > events/kprobes/myevent/enable ( echo "forked" ) -- cgit v1.2.3 From e3e40312567087fbe6880f316cb2b0e1f3d8a82c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 2 Oct 2020 14:25:01 +0100 Subject: selftests/ftrace: check for do_sys_openat2 in user-memory test More recent libc implementations are now using openat/openat2 system calls so also add do_sys_openat2 to the tracing so that the test passes on these systems because do_sys_open may not be called. Thanks to Masami Hiramatsu for the help on getting this fix to work correctly. Signed-off-by: Colin Ian King Acked-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc index a30a9c07290d..d25d01a19778 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc @@ -9,12 +9,16 @@ grep -A10 "fetcharg:" README | grep -q '\[u\]' || exit_unsupported :;: "user-memory access syntax and ustring working on user memory";: echo 'p:myevent do_sys_open path=+0($arg2):ustring path2=+u0($arg2):string' \ > kprobe_events +echo 'p:myevent2 do_sys_openat2 path=+0($arg2):ustring path2=+u0($arg2):string' \ + >> kprobe_events grep myevent kprobe_events | \ grep -q 'path=+0($arg2):ustring path2=+u0($arg2):string' echo 1 > events/kprobes/myevent/enable +echo 1 > events/kprobes/myevent2/enable echo > /dev/null echo 0 > events/kprobes/myevent/enable +echo 0 > events/kprobes/myevent2/enable grep myevent trace | grep -q 'path="/dev/null" path2="/dev/null"' -- cgit v1.2.3 From f825d3f7ed9305e7dd0a3e0a74673a4257d0cc53 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 8 Oct 2020 15:26:21 +0300 Subject: selftests: filter kselftest headers from command in lib.mk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1056d3d2c97e ("selftests: enforce local header dependency in lib.mk") added header dependency to the rule, but as the rule uses $^, the headers are added to the compiler command line. This can cause unexpected precompiled header files being generated when compilation fails: $ echo { >> openat2_test.c $ make gcc -Wall -O2 -g -fsanitize=address -fsanitize=undefined openat2_test.c tools/testing/selftests/kselftest_harness.h tools/testing/selftests/kselftest.h helpers.c -o tools/testing/selftests/openat2/openat2_test openat2_test.c:313:1: error: expected identifier or ‘(’ before ‘{’ token 313 | { | ^ make: *** [../lib.mk:140: tools/testing/selftests/openat2/openat2_test] Error 1 $ file openat2_test* openat2_test: GCC precompiled header (version 014) for C openat2_test.c: C source, ASCII text Fix it by filtering out the headers, so that we'll only pass the actual *.c files in the compiler command line. Fixes: 1056d3d2c97e ("selftests: enforce local header dependency in lib.mk") Signed-off-by: Tommi Rantala Acked-by: Kees Cook Reviewed-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 30848ca36555..a5ce26d548e4 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -136,7 +136,7 @@ endif ifeq ($(OVERRIDE_TARGETS),) LOCAL_HDRS := $(selfdir)/kselftest_harness.h $(selfdir)/kselftest.h $(OUTPUT)/%:%.c $(LOCAL_HDRS) - $(LINK.c) $^ $(LDLIBS) -o $@ + $(LINK.c) $(filter-out $(LOCAL_HDRS),$^) $(LDLIBS) -o $@ $(OUTPUT)/%.o:%.S $(COMPILE.S) $^ -o $@ -- cgit v1.2.3 From 1948172fdba5ad643529ddcd00a601c0caa913ed Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 8 Oct 2020 15:26:22 +0300 Subject: selftests: pidfd: fix compilation errors due to wait.h Drop unneeded header inclusion to fix pidfd compilation errors seen in Fedora 32: In file included from pidfd_open_test.c:9: ../../../../usr/include/linux/wait.h:17:16: error: expected identifier before numeric constant 17 | #define P_ALL 0 | ^ Signed-off-by: Tommi Rantala Reviewed-by: Kees Cook Acked-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/pidfd/pidfd_open_test.c | 1 - tools/testing/selftests/pidfd/pidfd_poll_test.c | 1 - 2 files changed, 2 deletions(-) diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c index b9fe75fc3e51..8a59438ccc78 100644 --- a/tools/testing/selftests/pidfd/pidfd_open_test.c +++ b/tools/testing/selftests/pidfd/pidfd_open_test.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include diff --git a/tools/testing/selftests/pidfd/pidfd_poll_test.c b/tools/testing/selftests/pidfd/pidfd_poll_test.c index 4b115444dfe9..610811275357 100644 --- a/tools/testing/selftests/pidfd/pidfd_poll_test.c +++ b/tools/testing/selftests/pidfd/pidfd_poll_test.c @@ -3,7 +3,6 @@ #define _GNU_SOURCE #include #include -#include #include #include #include -- cgit v1.2.3 From ef7086347c82c53a6c5238bd2cf31379f6acadde Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 8 Oct 2020 15:26:24 +0300 Subject: selftests/harness: prettify SKIP message whitespace again Commit 9847d24af95c ("selftests/harness: Refactor XFAIL into SKIP") replaced XFAIL with SKIP in the output. Add one more space to make the output aligned and pretty again. Fixes: 9847d24af95c ("selftests/harness: Refactor XFAIL into SKIP") Signed-off-by: Tommi Rantala Acked-by: Kees Cook Reviewed-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest_harness.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index f19804df244c..d2cba10acc6d 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -126,7 +126,7 @@ snprintf(_metadata->results->reason, \ sizeof(_metadata->results->reason), fmt, ##__VA_ARGS__); \ if (TH_LOG_ENABLED) { \ - fprintf(TH_LOG_STREAM, "# SKIP %s\n", \ + fprintf(TH_LOG_STREAM, "# SKIP %s\n", \ _metadata->results->reason); \ } \ _metadata->passed = 1; \ -- cgit v1.2.3 From 0b18fed98bf96ba5ac14ab7c43c8a3364cb0daf8 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 8 Oct 2020 15:26:25 +0300 Subject: selftests: pidfd: use ksft_test_result_skip() when skipping test There's planned tests != run tests in pidfd_test when some test is skipped: $ ./pidfd_test TAP version 13 1..8 [...] # pidfd_send_signal signal recycled pid test: Skipping test # Planned tests != run tests (8 != 7) # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0 Fix by using ksft_test_result_skip(): $ ./pidfd_test TAP version 13 1..8 [...] ok 8 # SKIP pidfd_send_signal signal recycled pid test: Unsharing pid namespace not permitted # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:1 error:0 Signed-off-by: Tommi Rantala Acked-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/pidfd/pidfd_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index c585aaa2acd8..529eb700ac26 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -330,7 +330,7 @@ static int test_pidfd_send_signal_recycled_pid_fail(void) ksft_exit_fail_msg("%s test: Failed to recycle pid %d\n", test_name, PID_RECYCLE); case PIDFD_SKIP: - ksft_print_msg("%s test: Skipping test\n", test_name); + ksft_test_result_skip("%s test: Skipping test\n", test_name); ret = 0; break; case PIDFD_XFAIL: -- cgit v1.2.3 From b5ec9fe5be5e02e7db9e79aaa9a1ea7a3419d0b5 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 8 Oct 2020 15:26:26 +0300 Subject: selftests: pidfd: skip test on kcmp() ENOSYS Skip test if kcmp() is not available, for example if kernel is compiled without CONFIG_CHECKPOINT_RESTORE=y. Signed-off-by: Tommi Rantala Acked-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/pidfd/pidfd_getfd_test.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/pidfd/pidfd_getfd_test.c b/tools/testing/selftests/pidfd/pidfd_getfd_test.c index 7758c98be015..0930e2411dfb 100644 --- a/tools/testing/selftests/pidfd/pidfd_getfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_getfd_test.c @@ -204,7 +204,10 @@ TEST_F(child, fetch_fd) fd = sys_pidfd_getfd(self->pidfd, self->remote_fd, 0); ASSERT_GE(fd, 0); - EXPECT_EQ(0, sys_kcmp(getpid(), self->pid, KCMP_FILE, fd, self->remote_fd)); + ret = sys_kcmp(getpid(), self->pid, KCMP_FILE, fd, self->remote_fd); + if (ret < 0 && errno == ENOSYS) + SKIP(return, "kcmp() syscall not supported"); + EXPECT_EQ(ret, 0); ret = fcntl(fd, F_GETFD); ASSERT_GE(ret, 0); -- cgit v1.2.3 From 90da74af349e8a476e1d357da735b8f35b56d4e6 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 8 Oct 2020 15:26:27 +0300 Subject: selftests: pidfd: add CONFIG_CHECKPOINT_RESTORE=y to config kcmp syscall is used in pidfd_getfd_test.c, so add CONFIG_CHECKPOINT_RESTORE=y to config to ensure kcmp is available. Signed-off-by: Tommi Rantala Acked-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/pidfd/config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config index bb11de90c0c9..f6f2965e17af 100644 --- a/tools/testing/selftests/pidfd/config +++ b/tools/testing/selftests/pidfd/config @@ -4,3 +4,4 @@ CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_CGROUPS=y +CONFIG_CHECKPOINT_RESTORE=y -- cgit v1.2.3 From 7b9621d4593199aa0268e56081fe730b71c053e6 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 8 Oct 2020 15:26:28 +0300 Subject: selftests: pidfd: drop needless linux/kcmp.h inclusion in pidfd_setns_test.c kcmp is not used in pidfd_setns_test.c, so do not include Signed-off-by: Tommi Rantala Acked-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/pidfd/pidfd_setns_test.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c index 1f085b922c6e..6e2f2cd400ca 100644 --- a/tools/testing/selftests/pidfd/pidfd_setns_test.c +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "pidfd.h" #include "../clone3/clone3_selftests.h" -- cgit v1.2.3 From a46b973bced1ba57420752bf38426acd9f6cbfa6 Mon Sep 17 00:00:00 2001 From: Ziyi Cao Date: Tue, 20 Oct 2020 00:08:06 +0800 Subject: USB: serial: option: add Quectel EC200T module support Add usb product id of the Quectel EC200T module. Signed-off-by: Ziyi Cao Link: https://lore.kernel.org/r/17f8a2a3-ce0f-4be7-8544-8fdf286907d0@www.fastmail.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 2a3bfd6f867e..7e879233bc0e 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -250,6 +250,7 @@ static void option_instat_callback(struct urb *urb); #define QUECTEL_PRODUCT_EP06 0x0306 #define QUECTEL_PRODUCT_EM12 0x0512 #define QUECTEL_PRODUCT_RM500Q 0x0800 +#define QUECTEL_PRODUCT_EC200T 0x6026 #define CMOTECH_VENDOR_ID 0x16d8 #define CMOTECH_PRODUCT_6001 0x6001 @@ -1117,6 +1118,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0xff, 0x10), .driver_info = ZLP }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200T, 0xff, 0, 0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6001) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CMU_300) }, -- cgit v1.2.3 From 77f6ab8b7768cf5e6bdd0e72499270a0671506ee Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 28 Oct 2020 16:39:49 -0400 Subject: don't dump the threads that had been already exiting when zapped. Coredump logics needs to report not only the registers of the dumping thread, but (since 2.5.43) those of other threads getting killed. Doing that might require extra state saved on the stack in asm glue at kernel entry; signal delivery logics does that (we need to be able to save sigcontext there, at the very least) and so does seccomp. That covers all callers of do_coredump(). Secondary threads get hit with SIGKILL and caught as soon as they reach exit_mm(), which normally happens in signal delivery, so those are also fine most of the time. Unfortunately, it is possible to end up with secondary zapped when it has already entered exit(2) (or, worse yet, is oopsing). In those cases we reach exit_mm() when mm->core_state is already set, but the stack contents is not what we would have in signal delivery. At least on two architectures (alpha and m68k) it leads to infoleaks - we end up with a chunk of kernel stack written into coredump, with the contents consisting of normal C stack frames of the call chain leading to exit_mm() instead of the expected copy of userland registers. In case of alpha we leak 312 bytes of stack. Other architectures (including the regset-using ones) might have similar problems - the normal user of regsets is ptrace and the state of tracee at the time of such calls is special in the same way signal delivery is. Note that had the zapper gotten to the exiting thread slightly later, it wouldn't have been included into coredump anyway - we skip the threads that have already cleared their ->mm. So let's pretend that zapper always loses the race. IOW, have exit_mm() only insert into the dumper list if we'd gotten there from handling a fatal signal[*] As the result, the callers of do_exit() that have *not* gone through get_signal() are not seen by coredump logics as secondary threads. Which excludes voluntary exit()/oopsen/traps/etc. The dumper thread itself is unaffected by that, so seccomp is fine. [*] originally I intended to add a new flag in tsk->flags, but ebiederman pointed out that PF_SIGNALED is already doing just what we need. Cc: stable@vger.kernel.org Fixes: d89f3847def4 ("[PATCH] thread-aware coredumps, 2.5.43-C3") History-tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Acked-by: "Eric W. Biederman" Signed-off-by: Al Viro --- kernel/exit.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/exit.c b/kernel/exit.c index 87a2d515de0d..1f236ed375f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -454,7 +454,10 @@ static void exit_mm(void) mmap_read_unlock(mm); self.task = current; - self.next = xchg(&core_state->dumper.next, &self); + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. -- cgit v1.2.3 From 821f5c90130d15f8f725412d714d05df3b9e0fac Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 28 Oct 2020 11:12:04 -0700 Subject: bpf: Add struct bpf_redir_neigh forward declaration to BPF helper defs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Forward-declare struct bpf_redir_neigh in bpf_helper_defs.h to avoid compiler warning about unknown structs. Fixes: ba452c9e996d ("bpf: Fix bpf_redirect_neigh helper api to support supplying nexthop") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20201028181204.111241-1-andrii@kernel.org --- scripts/bpf_helpers_doc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6769caae142f..31484377b8b1 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -408,6 +408,7 @@ class PrinterHelpers(Printer): 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', 'struct bpf_sock', 'struct bpf_sock_addr', 'struct bpf_sock_ops', -- cgit v1.2.3 From e5e1a4bc916d29958c3b587354293738fcb984d7 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 27 Oct 2020 13:32:01 +0100 Subject: xsk: Fix possible memory leak at socket close MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a possible memory leak at xsk socket close that is caused by the refcounting of the umem object being wrong. The reference count of the umem was decremented only after the pool had been freed. Note that if the buffer pool is destroyed, it is important that the umem is destroyed after the pool, otherwise the umem would disappear while the driver is still running. And as the buffer pool needs to be destroyed in a work queue, the umem is also (if its refcount reaches zero) destroyed after the buffer pool in that same work queue. What was missing is that the refcount also needs to be decremented when the pool is not freed and when the pool has not even been created. The first case happens when the refcount of the pool is higher than 1, i.e. it is still being used by some other socket using the same device and queue id. In this case, it is safe to decrement the refcount of the umem outside of the work queue as the umem will never be freed because the refcount of the umem is always greater than or equal to the refcount of the buffer pool. The second case is if the buffer pool has not been created yet, i.e. the socket was closed before it was bound but after the umem was created. In this case, it is safe to destroy the umem outside of the work queue, since there is no pool that can use it by definition. Fixes: 1c1efc2af158 ("xsk: Create and free buffer pool independently from umem") Reported-by: syzbot+eb71df123dc2be2c1456@syzkaller.appspotmail.com Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1603801921-2712-1-git-send-email-magnus.karlsson@gmail.com --- include/net/xsk_buff_pool.h | 2 +- net/xdp/xsk.c | 3 ++- net/xdp/xsk_buff_pool.c | 7 +++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 0140d086dc84..01755b838c74 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -86,7 +86,7 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, void xp_destroy(struct xsk_buff_pool *pool); void xp_release(struct xdp_buff_xsk *xskb); void xp_get_pool(struct xsk_buff_pool *pool); -void xp_put_pool(struct xsk_buff_pool *pool); +bool xp_put_pool(struct xsk_buff_pool *pool); void xp_clear_dev(struct xsk_buff_pool *pool); void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b71a32eeae65..cfbec3989a76 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -1146,7 +1146,8 @@ static void xsk_destruct(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) return; - xp_put_pool(xs->pool); + if (!xp_put_pool(xs->pool)) + xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); } diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 64c9e55d4d4e..8a3bf4e1318e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -251,15 +251,18 @@ void xp_get_pool(struct xsk_buff_pool *pool) refcount_inc(&pool->users); } -void xp_put_pool(struct xsk_buff_pool *pool) +bool xp_put_pool(struct xsk_buff_pool *pool) { if (!pool) - return; + return false; if (refcount_dec_and_test(&pool->users)) { INIT_WORK(&pool->work, xp_release_deferred); schedule_work(&pool->work); + return true; } + + return false; } static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool) -- cgit v1.2.3 From 1e6f5dcc1b9ec9068f5d38331cec38b35498edf5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Oct 2020 16:36:45 -0700 Subject: tools, bpftool: Avoid array index warnings. The bpf_caps array is shorter without CAP_BPF, avoid out of bounds reads if this isn't defined. Working around this avoids -Wno-array-bounds with clang. Signed-off-by: Ian Rogers Signed-off-by: Daniel Borkmann Reviewed-by: Tobias Klauser Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201027233646.3434896-1-irogers@google.com --- tools/bpf/bpftool/feature.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index a43a6f10b564..359960a8f1de 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -843,9 +843,14 @@ static int handle_perms(void) else p_err("missing %s%s%s%s%s%s%s%srequired for full feature probing; run as root or use 'unprivileged'", capability_msg(bpf_caps, 0), +#ifdef CAP_BPF capability_msg(bpf_caps, 1), capability_msg(bpf_caps, 2), - capability_msg(bpf_caps, 3)); + capability_msg(bpf_caps, 3) +#else + "", "", "", "", "", "" +#endif /* CAP_BPF */ + ); goto exit_free; } -- cgit v1.2.3 From 0698ac66e01019528f0db4191ae3aaf9978e67da Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Oct 2020 16:36:46 -0700 Subject: tools, bpftool: Remove two unused variables. Avoid an unused variable warning. Signed-off-by: Ian Rogers Signed-off-by: Daniel Borkmann Reviewed-by: Tobias Klauser Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201027233646.3434896-2-irogers@google.com --- tools/bpf/bpftool/skeleton/profiler.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/skeleton/profiler.bpf.c b/tools/bpf/bpftool/skeleton/profiler.bpf.c index 4e3512f700c0..ce5b65e07ab1 100644 --- a/tools/bpf/bpftool/skeleton/profiler.bpf.c +++ b/tools/bpf/bpftool/skeleton/profiler.bpf.c @@ -70,7 +70,7 @@ int BPF_PROG(fentry_XXX) static inline void fexit_update_maps(u32 id, struct bpf_perf_event_value *after) { - struct bpf_perf_event_value *before, diff, *accum; + struct bpf_perf_event_value *before, diff; before = bpf_map_lookup_elem(&fentry_readings, &id); /* only account samples with a valid fentry_reading */ @@ -95,7 +95,7 @@ int BPF_PROG(fexit_XXX) { struct bpf_perf_event_value readings[MAX_NUM_MATRICS]; u32 cpu = bpf_get_smp_processor_id(); - u32 i, one = 1, zero = 0; + u32 i, zero = 0; int err; u64 *count; -- cgit v1.2.3 From 2c334e12f957cd8c6bb66b4aa3f79848b7c33cab Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 26 Oct 2020 15:19:38 -0700 Subject: xfs: set xefi_discard when creating a deferred agfl free log intent item Make sure that we actually initialize xefi_discard when we're scheduling a deferred free of an AGFL block. This was (eventually) found by the UBSAN while I was banging on realtime rmap problems, but it exists in the upstream codebase. While we're at it, rearrange the structure to reduce the struct size from 64 to 56 bytes. Fixes: fcb762f5de2e ("xfs: add bmapi nodiscard flag") Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_alloc.c | 1 + fs/xfs/libxfs/xfs_bmap.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 852b536551b5..15640015be9d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2467,6 +2467,7 @@ xfs_defer_agfl_block( new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); new->xefi_blockcount = 1; new->xefi_oinfo = *oinfo; + new->xefi_skip_discard = false; trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e1bd484e5548..6747e97a7949 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -52,9 +52,9 @@ struct xfs_extent_free_item { xfs_fsblock_t xefi_startblock;/* starting fs block number */ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ + bool xefi_skip_discard; struct list_head xefi_list; struct xfs_owner_info xefi_oinfo; /* extent owner */ - bool xefi_skip_discard; }; #define XFS_BMAP_MAX_NMAP 4 -- cgit v1.2.3 From af61bc1e33d2c0ec22612b46050f5b58ac56a962 Mon Sep 17 00:00:00 2001 From: Keita Suzuki Date: Tue, 27 Oct 2020 07:31:24 +0000 Subject: scsi: hpsa: Fix memory leak in hpsa_init_one() When hpsa_scsi_add_host() fails, h->lastlogicals is leaked since it is missing a free() in the error handler. Fix this by adding free() when hpsa_scsi_add_host() fails. Link: https://lore.kernel.org/r/20201027073125.14229-1-keitasuzuki.park@sslab.ics.keio.ac.jp Tested-by: Don Brace Acked-by: Don Brace Signed-off-by: Keita Suzuki Signed-off-by: Martin K. Petersen --- drivers/scsi/hpsa.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 83ce4f11a589..8df70c92911d 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -8855,7 +8855,7 @@ reinit_after_soft_reset: /* hook into SCSI subsystem */ rc = hpsa_scsi_add_host(h); if (rc) - goto clean7; /* perf, sg, cmd, irq, shost, pci, lu, aer/h */ + goto clean8; /* lastlogicals, perf, sg, cmd, irq, shost, pci, lu, aer/h */ /* Monitor the controller for firmware lockups */ h->heartbeat_sample_interval = HEARTBEAT_SAMPLE_INTERVAL; @@ -8870,6 +8870,8 @@ reinit_after_soft_reset: HPSA_EVENT_MONITOR_INTERVAL); return 0; +clean8: /* lastlogicals, perf, sg, cmd, irq, shost, pci, lu, aer/h */ + kfree(h->lastlogicals); clean7: /* perf, sg, cmd, irq, shost, pci, lu, aer/h */ hpsa_free_performant_mode(h); h->access.set_intr_mask(h, HPSA_INTR_OFF); -- cgit v1.2.3 From 080b6f40763565f65ebb9540219c71ce885cf568 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 28 Oct 2020 18:15:05 +0100 Subject: bpf: Don't rely on GCC __attribute__((optimize)) to disable GCSE Commit 3193c0836 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") introduced a __no_fgcse macro that expands to a function scope __attribute__((optimize("-fno-gcse"))), to disable a GCC specific optimization that was causing trouble on x86 builds, and was not expected to have any positive effect in the first place. However, as the GCC manual documents, __attribute__((optimize)) is not for production use, and results in all other optimization options to be forgotten for the function in question. This can cause all kinds of trouble, but in one particular reported case, it causes -fno-asynchronous-unwind-tables to be disregarded, resulting in .eh_frame info to be emitted for the function. This reverts commit 3193c0836, and instead, it disables the -fgcse optimization for the entire source file, but only when building for X86 using GCC with CONFIG_BPF_JIT_ALWAYS_ON disabled. Note that the original commit states that CONFIG_RETPOLINE=n triggers the issue, whereas CONFIG_RETPOLINE=y performs better without the optimization, so it is kept disabled in both cases. Fixes: 3193c0836f20 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") Signed-off-by: Ard Biesheuvel Signed-off-by: Alexei Starovoitov Tested-by: Geert Uytterhoeven Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/lkml/CAMuHMdUg0WJHEcq6to0-eODpXPOywLot6UD2=GFHpzoj_hCoBQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20201028171506.15682-2-ardb@kernel.org --- include/linux/compiler-gcc.h | 2 -- include/linux/compiler_types.h | 4 ---- kernel/bpf/Makefile | 6 +++++- kernel/bpf/core.c | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index d1e3c6896b71..5deb37024574 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -175,5 +175,3 @@ #else #define __diag_GCC_8(s) #endif - -#define __no_fgcse __attribute__((optimize("-fno-gcse"))) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6e390d58a9f8..ac3fa37a84f9 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -247,10 +247,6 @@ struct ftrace_likely_data { #define asm_inline asm #endif -#ifndef __no_fgcse -# define __no_fgcse -#endif - /* Are two types/vars the same type (ignoring qualifiers)? */ #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index bdc8cd1b6767..c1b9f71ee6aa 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o -CFLAGS_core.o += $(call cc-disable-warning, override-init) +ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y) +# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details +cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse +endif +CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9268d77898b7..55454d2278b1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1369,7 +1369,7 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * * Decode and execute eBPF instructions. */ -static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z -- cgit v1.2.3 From f9ac7bbd6e4540dcc6df621b9c9b6eb2e26ded1d Mon Sep 17 00:00:00 2001 From: Greentime Hu Date: Thu, 29 Oct 2020 10:37:38 +0800 Subject: irqchip/sifive-plic: Fix chip_data access within a hierarchy The plic driver crashes in plic_irq_unmask() when the interrupt is within a hierarchy, as it picks the top-level chip_data instead of its local one. Using irq_data_get_irq_chip_data() instead of irq_get_chip_data() solves the issue for good. Fixes: f1ad1133b18f ("irqchip/sifive-plic: Add support for multiple PLICs") Signed-off-by: Greentime Hu [maz: rewrote commit message] Signed-off-by: Marc Zyngier Reviewed-by: Anup Patel Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20201029023738.127472-1-greentime.hu@sifive.com --- drivers/irqchip/irq-sifive-plic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index 4048657ece0a..6f432d2a5ceb 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -99,7 +99,7 @@ static inline void plic_irq_toggle(const struct cpumask *mask, struct irq_data *d, int enable) { int cpu; - struct plic_priv *priv = irq_get_chip_data(d->irq); + struct plic_priv *priv = irq_data_get_irq_chip_data(d); writel(enable, priv->regs + PRIORITY_BASE + d->hwirq * PRIORITY_PER_ID); for_each_cpu(cpu, mask) { @@ -115,7 +115,7 @@ static void plic_irq_unmask(struct irq_data *d) { struct cpumask amask; unsigned int cpu; - struct plic_priv *priv = irq_get_chip_data(d->irq); + struct plic_priv *priv = irq_data_get_irq_chip_data(d); cpumask_and(&amask, &priv->lmask, cpu_online_mask); cpu = cpumask_any_and(irq_data_get_affinity_mask(d), @@ -127,7 +127,7 @@ static void plic_irq_unmask(struct irq_data *d) static void plic_irq_mask(struct irq_data *d) { - struct plic_priv *priv = irq_get_chip_data(d->irq); + struct plic_priv *priv = irq_data_get_irq_chip_data(d); plic_irq_toggle(&priv->lmask, d, 0); } @@ -138,7 +138,7 @@ static int plic_set_affinity(struct irq_data *d, { unsigned int cpu; struct cpumask amask; - struct plic_priv *priv = irq_get_chip_data(d->irq); + struct plic_priv *priv = irq_data_get_irq_chip_data(d); cpumask_and(&amask, &priv->lmask, mask_val); -- cgit v1.2.3 From b388bdf2bac7aedac9bde5ab63eaf7646f29fc00 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 28 Oct 2020 16:39:55 +0100 Subject: irqchip/renesas-intc-irqpin: Merge irlm_bit and needs_irlm Get rid of the separate flag to indicate if the IRLM bit is present in the INTC/Interrupt Control Register 0, by considering -1 an invalid irlm_bit value. Signed-off-by: Geert Uytterhoeven Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201028153955.1736767-1-geert+renesas@glider.be --- drivers/irqchip/irq-renesas-intc-irqpin.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/irqchip/irq-renesas-intc-irqpin.c b/drivers/irqchip/irq-renesas-intc-irqpin.c index 3819185bfd02..cb7f60b3b4a9 100644 --- a/drivers/irqchip/irq-renesas-intc-irqpin.c +++ b/drivers/irqchip/irq-renesas-intc-irqpin.c @@ -71,8 +71,7 @@ struct intc_irqpin_priv { }; struct intc_irqpin_config { - unsigned int irlm_bit; - unsigned needs_irlm:1; + int irlm_bit; /* -1 if non-existent */ }; static unsigned long intc_irqpin_read32(void __iomem *iomem) @@ -349,11 +348,10 @@ static const struct irq_domain_ops intc_irqpin_irq_domain_ops = { static const struct intc_irqpin_config intc_irqpin_irlm_r8a777x = { .irlm_bit = 23, /* ICR0.IRLM0 */ - .needs_irlm = 1, }; static const struct intc_irqpin_config intc_irqpin_rmobile = { - .needs_irlm = 0, + .irlm_bit = -1, }; static const struct of_device_id intc_irqpin_dt_ids[] = { @@ -470,7 +468,7 @@ static int intc_irqpin_probe(struct platform_device *pdev) } /* configure "individual IRQ mode" where needed */ - if (config && config->needs_irlm) { + if (config && config->irlm_bit >= 0) { if (io[INTC_IRQPIN_REG_IRLM]) intc_irqpin_read_modify_write(p, INTC_IRQPIN_REG_IRLM, config->irlm_bit, 1, 1); -- cgit v1.2.3 From bb2bd7c7f3d0946acc2104db31df228d10f7b598 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 20 Oct 2020 10:32:42 +0300 Subject: dt-bindings: irqchip: ti, sci-inta: Update for unmapped event handling The new DMA architecture introduced with AM64 introduced new event types: unampped events. These events are mapped within INTA in contrast to other K3 devices where the events with similar function was originating from the UDMAP or ringacc. The ti,unmapped-event-sources should contain phandle array to the devices in the system (typically DMA controllers) from where the unmapped events originate. Signed-off-by: Peter Ujfalusi Signed-off-by: Marc Zyngier Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20201020073243.19255-2-peter.ujfalusi@ti.com --- .../devicetree/bindings/interrupt-controller/ti,sci-inta.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml b/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml index c7cd05656a3e..cc795498488f 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml @@ -32,6 +32,11 @@ description: | | | vint | bit | | 0 |.....|63| vintx | | +--------------+ +------------+ | | | + | Unmap | + | +--------------+ | + Unmapped events ----->| | umapidx |-------------------------> Globalevents + | +--------------+ | + | | +-----------------------------------------+ Configuration of these Intmap registers that maps global events to vint is @@ -70,6 +75,11 @@ properties: - description: | "limit" specifies the limit for translation + ti,unmapped-event-sources: + $ref: /schemas/types.yaml#definitions/phandle-array + description: + Array of phandles to DMA controllers where the unmapped events originate. + required: - compatible - reg -- cgit v1.2.3 From d95bdca75b3fb41bf185efe164e05aed820081a5 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 20 Oct 2020 10:32:43 +0300 Subject: irqchip/ti-sci-inta: Add support for unmapped event handling The DMA (BCDMA/PKTDMA and their rings/flows) events are under the INTA's supervision as unmapped events in AM64. In order to keep the current SW stack working, the INTA driver must replace the dev_id with it's own when a request comes for BCDMA or PKTDMA resources. Implement parsing of the optional "ti,unmapped-event-sources" phandle array to get the sci-dev-ids of the devices where the unmapped events originate. Signed-off-by: Peter Ujfalusi Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201020073243.19255-3-peter.ujfalusi@ti.com --- drivers/irqchip/irq-ti-sci-inta.c | 83 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-ti-sci-inta.c b/drivers/irqchip/irq-ti-sci-inta.c index e0cceb81c648..b2ab8db439d9 100644 --- a/drivers/irqchip/irq-ti-sci-inta.c +++ b/drivers/irqchip/irq-ti-sci-inta.c @@ -85,6 +85,17 @@ struct ti_sci_inta_vint_desc { * @base: Base address of the memory mapped IO registers * @pdev: Pointer to platform device. * @ti_sci_id: TI-SCI device identifier + * @unmapped_cnt: Number of @unmapped_dev_ids entries + * @unmapped_dev_ids: Pointer to an array of TI-SCI device identifiers of + * unmapped event sources. + * Unmapped Events are not part of the Global Event Map and + * they are converted to Global event within INTA to be + * received by the same INTA to generate an interrupt. + * In case an interrupt request comes for a device which is + * generating Unmapped Event, we must use the INTA's TI-SCI + * device identifier in place of the source device + * identifier to let sysfw know where it has to program the + * Global Event number. */ struct ti_sci_inta_irq_domain { const struct ti_sci_handle *sci; @@ -96,11 +107,37 @@ struct ti_sci_inta_irq_domain { void __iomem *base; struct platform_device *pdev; u32 ti_sci_id; + + int unmapped_cnt; + u16 *unmapped_dev_ids; }; #define to_vint_desc(e, i) container_of(e, struct ti_sci_inta_vint_desc, \ events[i]) +static u16 ti_sci_inta_get_dev_id(struct ti_sci_inta_irq_domain *inta, u32 hwirq) +{ + u16 dev_id = HWIRQ_TO_DEVID(hwirq); + int i; + + if (inta->unmapped_cnt == 0) + return dev_id; + + /* + * For devices sending Unmapped Events we must use the INTA's TI-SCI + * device identifier number to be able to convert it to a Global Event + * and map it to an interrupt. + */ + for (i = 0; i < inta->unmapped_cnt; i++) { + if (dev_id == inta->unmapped_dev_ids[i]) { + dev_id = inta->ti_sci_id; + break; + } + } + + return dev_id; +} + /** * ti_sci_inta_irq_handler() - Chained IRQ handler for the vint irqs * @desc: Pointer to irq_desc corresponding to the irq @@ -251,7 +288,7 @@ static struct ti_sci_inta_event_desc *ti_sci_inta_alloc_event(struct ti_sci_inta u16 dev_id, dev_index; int err; - dev_id = HWIRQ_TO_DEVID(hwirq); + dev_id = ti_sci_inta_get_dev_id(inta, hwirq); dev_index = HWIRQ_TO_IRQID(hwirq); event_desc = &vint_desc->events[free_bit]; @@ -352,14 +389,15 @@ static void ti_sci_inta_free_irq(struct ti_sci_inta_event_desc *event_desc, { struct ti_sci_inta_vint_desc *vint_desc; struct ti_sci_inta_irq_domain *inta; + u16 dev_id; vint_desc = to_vint_desc(event_desc, event_desc->vint_bit); inta = vint_desc->domain->host_data; + dev_id = ti_sci_inta_get_dev_id(inta, hwirq); /* free event irq */ mutex_lock(&inta->vint_mutex); inta->sci->ops.rm_irq_ops.free_event_map(inta->sci, - HWIRQ_TO_DEVID(hwirq), - HWIRQ_TO_IRQID(hwirq), + dev_id, HWIRQ_TO_IRQID(hwirq), inta->ti_sci_id, vint_desc->vint_id, event_desc->global_event, @@ -574,6 +612,41 @@ static struct msi_domain_info ti_sci_inta_msi_domain_info = { .chip = &ti_sci_inta_msi_irq_chip, }; +static int ti_sci_inta_get_unmapped_sources(struct ti_sci_inta_irq_domain *inta) +{ + struct device *dev = &inta->pdev->dev; + struct device_node *node = dev_of_node(dev); + struct of_phandle_iterator it; + int count, err, ret, i; + + count = of_count_phandle_with_args(node, "ti,unmapped-event-sources", NULL); + if (count <= 0) + return 0; + + inta->unmapped_dev_ids = devm_kcalloc(dev, count, + sizeof(*inta->unmapped_dev_ids), + GFP_KERNEL); + if (!inta->unmapped_dev_ids) + return -ENOMEM; + + i = 0; + of_for_each_phandle(&it, err, node, "ti,unmapped-event-sources", NULL, 0) { + u32 dev_id; + + ret = of_property_read_u32(it.node, "ti,sci-dev-id", &dev_id); + if (ret) { + dev_err(dev, "ti,sci-dev-id read failure for %pOFf\n", it.node); + of_node_put(it.node); + return ret; + } + inta->unmapped_dev_ids[i++] = dev_id; + } + + inta->unmapped_cnt = count; + + return 0; +} + static int ti_sci_inta_irq_domain_probe(struct platform_device *pdev) { struct irq_domain *parent_domain, *domain, *msi_domain; @@ -629,6 +702,10 @@ static int ti_sci_inta_irq_domain_probe(struct platform_device *pdev) if (IS_ERR(inta->base)) return PTR_ERR(inta->base); + ret = ti_sci_inta_get_unmapped_sources(inta); + if (ret) + return ret; + domain = irq_domain_add_linear(dev_of_node(dev), ti_sci_get_num_resources(inta->vint), &ti_sci_inta_irq_domain_ops, inta); -- cgit v1.2.3 From fd552e0542b4532483289cce48fdbd27b692984b Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 28 Oct 2020 11:27:17 -0400 Subject: powerpc/eeh_cache: Fix a possible debugfs deadlock Lockdep complains that a possible deadlock below in eeh_addr_cache_show() because it is acquiring a lock with IRQ enabled, but eeh_addr_cache_insert_dev() needs to acquire the same lock with IRQ disabled. Let's just make eeh_addr_cache_show() acquire the lock with IRQ disabled as well. CPU0 CPU1 ---- ---- lock(&pci_io_addr_cache_root.piar_lock); local_irq_disable(); lock(&tp->lock); lock(&pci_io_addr_cache_root.piar_lock); lock(&tp->lock); *** DEADLOCK *** lock_acquire+0x140/0x5f0 _raw_spin_lock_irqsave+0x64/0xb0 eeh_addr_cache_insert_dev+0x48/0x390 eeh_probe_device+0xb8/0x1a0 pnv_pcibios_bus_add_device+0x3c/0x80 pcibios_bus_add_device+0x118/0x290 pci_bus_add_device+0x28/0xe0 pci_bus_add_devices+0x54/0xb0 pcibios_init+0xc4/0x124 do_one_initcall+0xac/0x528 kernel_init_freeable+0x35c/0x3fc kernel_init+0x24/0x148 ret_from_kernel_thread+0x5c/0x80 lock_acquire+0x140/0x5f0 _raw_spin_lock+0x4c/0x70 eeh_addr_cache_show+0x38/0x110 seq_read+0x1a0/0x660 vfs_read+0xc8/0x1f0 ksys_read+0x74/0x130 system_call_exception+0xf8/0x1d0 system_call_common+0xe8/0x218 Fixes: 5ca85ae6318d ("powerpc/eeh_cache: Add a way to dump the EEH address cache") Signed-off-by: Qian Cai Reviewed-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201028152717.8967-1-cai@redhat.com --- arch/powerpc/kernel/eeh_cache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 6b50bf15d8c1..bf3270426d82 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -264,8 +264,9 @@ static int eeh_addr_cache_show(struct seq_file *s, void *v) { struct pci_io_addr_range *piar; struct rb_node *n; + unsigned long flags; - spin_lock(&pci_io_addr_cache_root.piar_lock); + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); for (n = rb_first(&pci_io_addr_cache_root.rb_root); n; n = rb_next(n)) { piar = rb_entry(n, struct pci_io_addr_range, rb_node); @@ -273,7 +274,7 @@ static int eeh_addr_cache_show(struct seq_file *s, void *v) (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev)); } - spin_unlock(&pci_io_addr_cache_root.piar_lock); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); return 0; } -- cgit v1.2.3 From 99f070b62322a4b8c1252952735806d09eb44b68 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 28 Oct 2020 14:23:34 -0400 Subject: powerpc/smp: Call rcu_cpu_starting() earlier The call to rcu_cpu_starting() in start_secondary() is not early enough in the CPU-hotplug onlining process, which results in lockdep splats as follows (with CONFIG_PROVE_RCU_LIST=y): WARNING: suspicious RCU usage ----------------------------- kernel/locking/lockdep.c:3497 RCU-list traversed in non-reader section!! other info that might help us debug this: RCU used illegally from offline CPU! rcu_scheduler_active = 1, debug_locks = 1 no locks held by swapper/1/0. Call Trace: dump_stack+0xec/0x144 (unreliable) lockdep_rcu_suspicious+0x128/0x14c __lock_acquire+0x1060/0x1c60 lock_acquire+0x140/0x5f0 _raw_spin_lock_irqsave+0x64/0xb0 clockevents_register_device+0x74/0x270 register_decrementer_clockevent+0x94/0x110 start_secondary+0x134/0x800 start_secondary_prolog+0x10/0x14 This is avoided by adding a call to rcu_cpu_starting() near the beginning of the start_secondary() function. Note that the raw_smp_processor_id() is required in order to avoid calling into lockdep before RCU has declared the CPU to be watched for readers. It's safe to call rcu_cpu_starting() in the arch code as well as later in generic code, as explained by Paul: It uses a per-CPU variable so that RCU pays attention only to the first call to rcu_cpu_starting() if there is more than one of them. This is even intentional, due to there being a generic arch-independent call to rcu_cpu_starting() in notify_cpu_starting(). So multiple calls to rcu_cpu_starting() are fine by design. Fixes: 4d004099a668 ("lockdep: Fix lockdep recursion") Signed-off-by: Qian Cai Acked-by: Paul E. McKenney [mpe: Add Fixes tag, reword slightly & expand change log] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201028182334.13466-1-cai@redhat.com --- arch/powerpc/kernel/smp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 3c6b9822f978..8c2857cbd960 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1393,13 +1393,14 @@ static void add_cpu_to_masks(int cpu) /* Activate a secondary processor. */ void start_secondary(void *unused) { - unsigned int cpu = smp_processor_id(); + unsigned int cpu = raw_smp_processor_id(); mmgrab(&init_mm); current->active_mm = &init_mm; smp_store_cpu_info(cpu); set_dec(tb_ticks_per_jiffy); + rcu_cpu_starting(cpu); preempt_disable(); cpu_callin_map[cpu] = 1; -- cgit v1.2.3 From 92ca318e11d76562bb9448295a4fd96b6580954f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 2 Nov 2020 11:32:12 +0100 Subject: docs: ABI: sysfs-driver-dma-ioatdma: what starts with /sys This is the only file where the /sys doesn't start with a /. So, rename them to: sys -> /sys Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/f4c53fff9696a61ff0e144fee237a9527982626d.1604312590.git.mchehab+huawei@kernel.org Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/stable/sysfs-driver-dma-ioatdma | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-ioatdma b/Documentation/ABI/stable/sysfs-driver-dma-ioatdma index 420c1d09e42f..3a4e2cd0ddcc 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-ioatdma +++ b/Documentation/ABI/stable/sysfs-driver-dma-ioatdma @@ -1,29 +1,29 @@ -What: sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dmachan/quickdata/cap +What: /sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dmachan/quickdata/cap Date: December 3, 2009 KernelVersion: 2.6.32 Contact: dmaengine@vger.kernel.org Description: Capabilities the DMA supports.Currently there are DMA_PQ, DMA_PQ_VAL, DMA_XOR,DMA_XOR_VAL,DMA_INTERRUPT. -What: sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dmachan/quickdata/ring_active +What: /sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dmachan/quickdata/ring_active Date: December 3, 2009 KernelVersion: 2.6.32 Contact: dmaengine@vger.kernel.org Description: The number of descriptors active in the ring. -What: sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dmachan/quickdata/ring_size +What: /sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dmachan/quickdata/ring_size Date: December 3, 2009 KernelVersion: 2.6.32 Contact: dmaengine@vger.kernel.org Description: Descriptor ring size, total number of descriptors available. -What: sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dmachan/quickdata/version +What: /sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dmachan/quickdata/version Date: December 3, 2009 KernelVersion: 2.6.32 Contact: dmaengine@vger.kernel.org Description: Version of ioatdma device. -What: sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dmachan/quickdata/intr_coalesce +What: /sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dmachan/quickdata/intr_coalesce Date: August 8, 2017 KernelVersion: 4.14 Contact: dmaengine@vger.kernel.org -- cgit v1.2.3 From 1088ee2230ac5e1c889d5ba020f37c09000ee3af Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 2 Nov 2020 11:32:13 +0100 Subject: docs: ABI: sysfs-class-net: fix a typo clas->class Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/4bba2a1592df5a9435c8d4757a9abf20246e2a99.1604312590.git.mchehab+huawei@kernel.org Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-class-net | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index 7670012ae9b6..1f2002df5ba2 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -152,7 +152,7 @@ Description: When an interface is under test, it cannot be expected to pass packets as normal. -What: /sys/clas/net//duplex +What: /sys/class/net//duplex Date: October 2009 KernelVersion: 2.6.33 Contact: netdev@vger.kernel.org -- cgit v1.2.3 From e186d80e2b85ab3e69de941d069ab9e11018ddf4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 2 Nov 2020 11:32:14 +0100 Subject: docs: leds: index.rst: add a missing file Changeset 26a07553041e ("docs: ABI: sysfs-class-led-trigger-pattern: remove hw_pattern duplication") didn't include the needed changes at index.rst. Fixes: 26a07553041e ("docs: ABI: sysfs-class-led-trigger-pattern: remove hw_pattern duplication") Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/36a6e3aef6e57ea349f1b47c7731d4cd1e03ca77.1604312590.git.mchehab+huawei@kernel.org Signed-off-by: Greg Kroah-Hartman --- Documentation/leds/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst index 53e6090454af..e5d63b940045 100644 --- a/Documentation/leds/index.rst +++ b/Documentation/leds/index.rst @@ -25,3 +25,4 @@ LEDs leds-lp5562 leds-lp55xx leds-mlxcpld + leds-sc27xx -- cgit v1.2.3 From daaaf58a2b7fd59951bd090eddee131f26422e20 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 2 Nov 2020 11:32:15 +0100 Subject: scripts: get_abi.pl: Don't let ABI files to create subtitles The ReST output should only contain documentation titles automatically created by the script. There are two reasons for that: 1) Consistency. just a handful ABI docs define titles 2) To avoid critical errors. Docutils (which is the basis for Sphinx) allows a free assign of documentation title markups. So, one document could be doing things like: Level 1 ======= Level 2 ------- While another one could do the reverse: Level 1 ------- Level 2 ======= But the same document can't mix. As the output of get_abi.pl will join contents from multiple files, if they don't define the levels on a consistent errors, errors like this can happen: Sphinx parallel build error: docutils.utils.SystemMessage: /home/rdunlap/lnx/lnx-510-rc2/Documentation/ABI/testing/sysfs-bus-rapidio:2: (SEVERE/4) Title level inconsistent: Attributes Common for All RapidIO Devices ----------------------------------------- Which cause some versions of Sphinx to go into an endless loop. It should be noticed that an alternative to that would be to replace all title occurrences by a single markup, but that will make the parser more complex, and, due to (1) it would generate an inconsistent output. So, better to just remove the titles defined at the ABI files from the output. Reported-by: Randy Dunlap Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/6c62ef5c01d39dee8d891f8390c816d2a889670a.1604312590.git.mchehab+huawei@kernel.org Signed-off-by: Greg Kroah-Hartman --- scripts/get_abi.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/get_abi.pl b/scripts/get_abi.pl index 2cb592f8eba4..459f169f834c 100755 --- a/scripts/get_abi.pl +++ b/scripts/get_abi.pl @@ -352,6 +352,12 @@ sub output_rest { if (!($desc =~ /^\s*$/)) { if ($description_is_rst) { + # Remove title markups from the description + # Having titles inside ABI files will only work if extra + # care would be taken in order to strictly follow the same + # level order for each markup. + $desc =~ s/\n[\-\*\=\^\~]+\n/\n\n/g; + # Enrich text by creating cross-references $desc =~ s,Documentation/(?!devicetree)(\S+)\.rst,:doc:`/$1`,g; -- cgit v1.2.3 From 9d4fdda3344611ec53ededccc0c13cb149ba4375 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 2 Nov 2020 11:32:16 +0100 Subject: scripts: get_api.pl: Add sub-titles to ABI output Instead of adding titles just for the files, add titles for each part of the ABI output, in order to make easier to search for a symbol there. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/64752a5de06ab8263c296e3ed01414b25861e1eb.1604312590.git.mchehab+huawei@kernel.org Signed-off-by: Greg Kroah-Hartman --- scripts/get_abi.pl | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/scripts/get_abi.pl b/scripts/get_abi.pl index 459f169f834c..68dab828a722 100755 --- a/scripts/get_abi.pl +++ b/scripts/get_abi.pl @@ -287,6 +287,8 @@ my $bondary = qr { (?{type} eq "File") cmp ($data{$b}->{type} eq "File") || $a cmp $b @@ -306,6 +308,21 @@ sub output_rest { $w =~ s/([\(\)\_\-\*\=\^\~\\])/\\$1/g; if ($type ne "File") { + my $cur_part = $what; + if ($what =~ '/') { + if ($what =~ m#^(\/?(?:[\w\-]+\/?){1,2})#) { + $cur_part = "Symbols under $1"; + $cur_part =~ s,/$,,; + } + } + + if ($cur_part ne "" && $part ne $cur_part) { + $part = $cur_part; + my $bar = $part; + $bar =~ s/./-/g; + print "$part\n$bar\n\n"; + } + printf ".. _%s:\n\n", $data{$what}->{label}; my @names = split /, /,$w; -- cgit v1.2.3 From e9696d259d0fb5d239e8c28ca41089838ea76d13 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 26 Oct 2020 17:02:14 -0700 Subject: swiotlb: fix "x86: Don't panic if can not alloc buffer for swiotlb" kernel/dma/swiotlb.c:swiotlb_init gets called first and tries to allocate a buffer for the swiotlb. It does so by calling memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE); If the allocation must fail, no_iotlb_memory is set. Later during initialization swiotlb-xen comes in (drivers/xen/swiotlb-xen.c:xen_swiotlb_init) and given that io_tlb_start is != 0, it thinks the memory is ready to use when actually it is not. When the swiotlb is actually needed, swiotlb_tbl_map_single gets called and since no_iotlb_memory is set the kernel panics. Instead, if swiotlb-xen.c:xen_swiotlb_init knew the swiotlb hadn't been initialized, it would do the initialization itself, which might still succeed. Fix the panic by setting io_tlb_start to 0 on swiotlb initialization failure, and also by setting no_iotlb_memory to false on swiotlb initialization success. Fixes: ac2cbab21f31 ("x86: Don't panic if can not alloc buffer for swiotlb") Reported-by: Elliott Mitchell Tested-by: Elliott Mitchell Signed-off-by: Stefano Stabellini Reviewed-by: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b4eea0abc3f0..54078f0d4c87 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -229,6 +229,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; + no_iotlb_memory = false; if (verbose) swiotlb_print_info(); @@ -260,9 +261,11 @@ swiotlb_init(int verbose) if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; - if (io_tlb_start) + if (io_tlb_start) { memblock_free_early(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + io_tlb_start = 0; + } pr_warn("Cannot allocate buffer"); no_iotlb_memory = true; } @@ -360,6 +363,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; + no_iotlb_memory = false; swiotlb_print_info(); -- cgit v1.2.3 From fc0021aa340af65a0a37d77be39e22aa886a6132 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Oct 2020 08:33:09 +0200 Subject: swiotlb: remove the tbl_dma_addr argument to swiotlb_tbl_map_single The tbl_dma_addr argument is used to check the DMA boundary for the allocations, and thus needs to be a dma_addr_t. swiotlb-xen instead passed a physical address, which could lead to incorrect results for strange offsets. Fix this by removing the parameter entirely and hard code the DMA address for io_tlb_start instead. Fixes: 91ffe4ad534a ("swiotlb-xen: introduce phys_to_dma/dma_to_phys translations") Signed-off-by: Christoph Hellwig Reviewed-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/iommu/intel/iommu.c | 5 ++--- drivers/xen/swiotlb-xen.c | 3 +-- include/linux/swiotlb.h | 10 +++------- kernel/dma/swiotlb.c | 16 ++++++---------- 4 files changed, 12 insertions(+), 22 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8651f6d4dfa0..6b560e6f1930 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3815,9 +3815,8 @@ bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, * page aligned, we don't need to use a bounce page. */ if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) { - tlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, aligned_size, dir, attrs); + tlb_addr = swiotlb_tbl_map_single(dev, paddr, size, + aligned_size, dir, attrs); if (tlb_addr == DMA_MAPPING_ERROR) { goto swiotlb_error; } else { diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 71ce1b7a23d1..2b385c1b4a99 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -395,8 +395,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, */ trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - map = swiotlb_tbl_map_single(dev, virt_to_phys(xen_io_tlb_start), - phys, size, size, dir, attrs); + map = swiotlb_tbl_map_single(dev, phys, size, size, dir, attrs); if (map == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 513913ff7486..3bb72266a75a 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -45,13 +45,9 @@ enum dma_sync_target { SYNC_FOR_DEVICE = 1, }; -extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t phys, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs); +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs); extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 54078f0d4c87..781b9dca197c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -445,14 +445,11 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs) +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) { + dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); unsigned long flags; phys_addr_t tlb_addr; unsigned int nslots, stride, index, wrap; @@ -671,9 +668,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, swiotlb_force); - swiotlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, size, dir, attrs); + swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir, + attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; -- cgit v1.2.3 From 3fc2bfa365311c6ef3e4411437786a54a911d9a9 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 16 Oct 2020 12:13:00 +0200 Subject: nfsroot: Default mount option should ask for built-in NFS version Change the nfsroot default mount option to ask for NFSv2 only *if* the kernel was built with NFSv2 support. If not, default to NFSv3 or as last choice to NFSv4, depending on actual kernel config. Signed-off-by: Helge Deller Signed-off-by: Anna Schumaker --- fs/nfs/nfsroot.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 8d3278805602..fa148308822c 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -88,7 +88,13 @@ #define NFS_ROOT "/tftpboot/%s" /* Default NFSROOT mount options. */ +#if defined(CONFIG_NFS_V2) #define NFS_DEF_OPTIONS "vers=2,tcp,rsize=4096,wsize=4096" +#elif defined(CONFIG_NFS_V3) +#define NFS_DEF_OPTIONS "vers=3,tcp,rsize=4096,wsize=4096" +#else +#define NFS_DEF_OPTIONS "vers=4,tcp,rsize=4096,wsize=4096" +#endif /* Parameters passed from the kernel command line */ static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = ""; -- cgit v1.2.3 From 38210800bf66d7302da1bb5b624ad68638da1562 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 30 Oct 2020 10:28:54 -0700 Subject: Revert "nvme-pci: remove last_sq_tail" Multiple CPUs may be mapped to the same hctx, allowing mulitple submission contexts to attempt commit_rqs(). We need to verify we're not writing the same doorbell value multiple times since that's a spec violation. Revert commit 54b2fcee1db041a83b52b51752dade6090cf952f. Link: https://bugzilla.redhat.com/show_bug.cgi?id=1878596 Reported-by: "B.L. Jones" Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index df8f3612107f..0578ff253c47 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -198,6 +198,7 @@ struct nvme_queue { u32 q_depth; u16 cq_vector; u16 sq_tail; + u16 last_sq_tail; u16 cq_head; u16 qid; u8 cq_phase; @@ -455,11 +456,24 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) return 0; } -static inline void nvme_write_sq_db(struct nvme_queue *nvmeq) +/* + * Write sq tail if we are asked to, or if the next command would wrap. + */ +static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq) { + if (!write_sq) { + u16 next_tail = nvmeq->sq_tail + 1; + + if (next_tail == nvmeq->q_depth) + next_tail = 0; + if (next_tail != nvmeq->last_sq_tail) + return; + } + if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) writel(nvmeq->sq_tail, nvmeq->q_db); + nvmeq->last_sq_tail = nvmeq->sq_tail; } /** @@ -476,8 +490,7 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, cmd, sizeof(*cmd)); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; - if (write_sq) - nvme_write_sq_db(nvmeq); + nvme_write_sq_db(nvmeq, write_sq); spin_unlock(&nvmeq->sq_lock); } @@ -486,7 +499,8 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) struct nvme_queue *nvmeq = hctx->driver_data; spin_lock(&nvmeq->sq_lock); - nvme_write_sq_db(nvmeq); + if (nvmeq->sq_tail != nvmeq->last_sq_tail) + nvme_write_sq_db(nvmeq, true); spin_unlock(&nvmeq->sq_lock); } @@ -1496,6 +1510,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) struct nvme_dev *dev = nvmeq->dev; nvmeq->sq_tail = 0; + nvmeq->last_sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; -- cgit v1.2.3 From 7a078d2d18801bba7bde7337a823d7342299acf7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Oct 2020 15:37:07 -0700 Subject: libbpf, hashmap: Fix undefined behavior in hash_bits If bits is 0, the case when the map is empty, then the >> is the size of the register which is undefined behavior - on x86 it is the same as a shift by 0. Fix by handling the 0 case explicitly and guarding calls to hash_bits for empty maps in hashmap__for_each_key_entry and hashmap__for_each_entry_safe. Fixes: e3b924224028 ("libbpf: add resizable non-thread safe internal hashmap") Suggested-by: Andrii Nakryiko , Signed-off-by: Ian Rogers Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201029223707.494059-1-irogers@google.com --- tools/lib/bpf/hashmap.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/hashmap.h b/tools/lib/bpf/hashmap.h index d9b385fe808c..10a4c4cd13cf 100644 --- a/tools/lib/bpf/hashmap.h +++ b/tools/lib/bpf/hashmap.h @@ -15,6 +15,9 @@ static inline size_t hash_bits(size_t h, int bits) { /* shuffle bits and return requested number of upper bits */ + if (bits == 0) + return 0; + #if (__SIZEOF_SIZE_T__ == __SIZEOF_LONG_LONG__) /* LP64 case */ return (h * 11400714819323198485llu) >> (__SIZEOF_LONG_LONG__ * 8 - bits); @@ -174,17 +177,17 @@ bool hashmap__find(const struct hashmap *map, const void *key, void **value); * @key: key to iterate entries for */ #define hashmap__for_each_key_entry(map, cur, _key) \ - for (cur = ({ size_t bkt = hash_bits(map->hash_fn((_key), map->ctx),\ - map->cap_bits); \ - map->buckets ? map->buckets[bkt] : NULL; }); \ + for (cur = map->buckets \ + ? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \ + : NULL; \ cur; \ cur = cur->next) \ if (map->equal_fn(cur->key, (_key), map->ctx)) #define hashmap__for_each_key_entry_safe(map, cur, tmp, _key) \ - for (cur = ({ size_t bkt = hash_bits(map->hash_fn((_key), map->ctx),\ - map->cap_bits); \ - cur = map->buckets ? map->buckets[bkt] : NULL; }); \ + for (cur = map->buckets \ + ? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \ + : NULL; \ cur && ({ tmp = cur->next; true; }); \ cur = tmp) \ if (map->equal_fn(cur->key, (_key), map->ctx)) -- cgit v1.2.3 From 5faf50e9e9fdc2117c61ff7e20da49cd6a29e0ca Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 24 Sep 2020 12:45:59 +0200 Subject: scsi: scsi_dh_alua: Avoid crash during alua_bus_detach() alua_bus_detach() might be running concurrently with alua_rtpg_work(), so we might trip over h->sdev == NULL and call BUG_ON(). The correct way of handling it is to not set h->sdev to NULL in alua_bus_detach(), and call rcu_synchronize() before the final delete to ensure that all concurrent threads have left the critical section. Then we can get rid of the BUG_ON() and replace it with a simple if condition. Link: https://lore.kernel.org/r/1600167537-12509-1-git-send-email-jitendra.khasdev@oracle.com Link: https://lore.kernel.org/r/20200924104559.26753-1-hare@suse.de Cc: Brian Bunker Acked-by: Brian Bunker Tested-by: Jitendra Khasdev Reviewed-by: Jitendra Khasdev Signed-off-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/device_handler/scsi_dh_alua.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index f32da0ca529e..308bda2e9c00 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -658,8 +658,8 @@ static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg) rcu_read_lock(); list_for_each_entry_rcu(h, &tmp_pg->dh_list, node) { - /* h->sdev should always be valid */ - BUG_ON(!h->sdev); + if (!h->sdev) + continue; h->sdev->access_state = desc[0]; } rcu_read_unlock(); @@ -705,7 +705,8 @@ static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg) pg->expiry = 0; rcu_read_lock(); list_for_each_entry_rcu(h, &pg->dh_list, node) { - BUG_ON(!h->sdev); + if (!h->sdev) + continue; h->sdev->access_state = (pg->state & SCSI_ACCESS_STATE_MASK); if (pg->pref) @@ -1147,7 +1148,6 @@ static void alua_bus_detach(struct scsi_device *sdev) spin_lock(&h->pg_lock); pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock)); rcu_assign_pointer(h->pg, NULL); - h->sdev = NULL; spin_unlock(&h->pg_lock); if (pg) { spin_lock_irq(&pg->lock); @@ -1156,6 +1156,7 @@ static void alua_bus_detach(struct scsi_device *sdev) kref_put(&pg->kref, release_port_group); } sdev->handler_data = NULL; + synchronize_rcu(); kfree(h); } -- cgit v1.2.3 From 5feed64f9199ff90c4239971733f23f30aeb2484 Mon Sep 17 00:00:00 2001 From: Sreekanth Reddy Date: Mon, 2 Nov 2020 12:57:46 +0530 Subject: scsi: mpt3sas: Fix timeouts observed while reenabling IRQ While reenabling the IRQ after irq poll there may be small time window where HBA firmware has posted some replies and raise the interrupts but driver has not received the interrupts. So we may observe I/O timeouts as the driver has not processed the replies as interrupts got missed while reenabling the IRQ. To fix this issue the driver has to go for one more round of processing the reply descriptors from reply descriptor post queue after enabling the IRQ. Link: https://lore.kernel.org/r/20201102072746.27410-1-sreekanth.reddy@broadcom.com Reported-by: Tomas Henzl Reviewed-by: Tomas Henzl Signed-off-by: Sreekanth Reddy Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_base.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 93230cd1982f..e4cc92bc4d94 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -1740,6 +1740,13 @@ _base_irqpoll(struct irq_poll *irqpoll, int budget) reply_q->irq_poll_scheduled = false; reply_q->irq_line_enable = true; enable_irq(reply_q->os_irq); + /* + * Go for one more round of processing the + * reply descriptor post queue incase if HBA + * Firmware has posted some reply descriptors + * while reenabling the IRQ. + */ + _base_process_reply_queue(reply_q); } return num_entries; -- cgit v1.2.3 From afaa2e745a246c5ab95103a65b1ed00101e1bc63 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 2 Nov 2020 09:58:21 -0500 Subject: USB: Add NO_LPM quirk for Kingston flash drive In Bugzilla #208257, Julien Humbert reports that a 32-GB Kingston flash drive spontaneously disconnects and reconnects, over and over. Testing revealed that disabling Link Power Management for the drive fixed the problem. This patch adds a quirk entry for that drive to turn off LPM permanently. CC: Hans de Goede CC: Reported-and-tested-by: Julien Humbert Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/20201102145821.GA1478741@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 10574fa3f927..a1e3a037a289 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -378,6 +378,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x0926, 0x3333), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, + /* Kingston DataTraveler 3.0 */ + { USB_DEVICE(0x0951, 0x1666), .driver_info = USB_QUIRK_NO_LPM }, + /* X-Rite/Gretag-Macbeth Eye-One Pro display colorimeter */ { USB_DEVICE(0x0971, 0x2000), .driver_info = USB_QUIRK_NO_SET_INTF }, -- cgit v1.2.3 From 04800fbff4764ab7b32c49d19628605a5d4cb85c Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Thu, 22 Oct 2020 10:15:00 +0800 Subject: nvme: introduce nvme_sync_io_queues Introduce sync io queues for some scenarios which just only need sync io queues not sync all queues. Signed-off-by: Chao Leng Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 ++++++-- drivers/nvme/host/nvme.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 376096bfc54a..40ca71b29bb9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4582,8 +4582,7 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_queues); - -void nvme_sync_queues(struct nvme_ctrl *ctrl) +void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; @@ -4591,7 +4590,12 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl) list_for_each_entry(ns, &ctrl->namespaces, list) blk_sync_queue(ns->queue); up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_sync_io_queues); +void nvme_sync_queues(struct nvme_ctrl *ctrl) +{ + nvme_sync_io_queues(ctrl); if (ctrl->admin_q) blk_sync_queue(ctrl->admin_q); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cc111136a981..bc330bf0d3bd 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -602,6 +602,7 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); void nvme_kill_queues(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); +void nvme_sync_io_queues(struct nvme_ctrl *ctrl); void nvme_unfreeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze(struct nvme_ctrl *ctrl); int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); -- cgit v1.2.3 From 3017013dcc82a4862bd1e140f8b762cfc594008d Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Thu, 22 Oct 2020 10:15:08 +0800 Subject: nvme-rdma: avoid race between time out and tear down Now use teardown_lock to serialize for time out and tear down. This may cause abnormal: first cancel all request in tear down, then time out may complete the request again, but the request may already be freed or restarted. To avoid race between time out and tear down, in tear down process, first we quiesce the queue, and then delete the timer and cancel the time out work for the queue. At the same time we need to delete teardown_lock. Signed-off-by: Chao Leng Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 40a0a3b6476c..83dbf2223125 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -122,7 +122,6 @@ struct nvme_rdma_ctrl { struct sockaddr_storage src_addr; struct nvme_ctrl ctrl; - struct mutex teardown_lock; bool use_inline_data; u32 io_queues[HCTX_MAX_TYPES]; }; @@ -1010,8 +1009,8 @@ out_free_io_queues: static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { - mutex_lock(&ctrl->teardown_lock); blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); if (ctrl->ctrl.admin_tagset) { blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset, @@ -1021,16 +1020,15 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, if (remove) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_destroy_admin_queue(ctrl, remove); - mutex_unlock(&ctrl->teardown_lock); } static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, bool remove) { - mutex_lock(&ctrl->teardown_lock); if (ctrl->ctrl.queue_count > 1) { nvme_start_freeze(&ctrl->ctrl); nvme_stop_queues(&ctrl->ctrl); + nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); if (ctrl->ctrl.tagset) { blk_mq_tagset_busy_iter(ctrl->ctrl.tagset, @@ -1041,7 +1039,6 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, nvme_start_queues(&ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, remove); } - mutex_unlock(&ctrl->teardown_lock); } static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) @@ -1976,16 +1973,12 @@ static void nvme_rdma_complete_timed_out(struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_queue *queue = req->queue; - struct nvme_rdma_ctrl *ctrl = queue->ctrl; - /* fence other contexts that may complete the command */ - mutex_lock(&ctrl->teardown_lock); nvme_rdma_stop_queue(queue); if (!blk_mq_request_completed(rq)) { nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; blk_mq_complete_request(rq); } - mutex_unlock(&ctrl->teardown_lock); } static enum blk_eh_timer_return @@ -2320,7 +2313,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, return ERR_PTR(-ENOMEM); ctrl->ctrl.opts = opts; INIT_LIST_HEAD(&ctrl->list); - mutex_init(&ctrl->teardown_lock); if (!(opts->mask & NVMF_OPT_TRSVCID)) { opts->trsvcid = -- cgit v1.2.3 From d6f66210f4b1aa2f5944f0e34e0f8db44f499f92 Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Thu, 22 Oct 2020 10:15:15 +0800 Subject: nvme-tcp: avoid race between time out and tear down Now use teardown_lock to serialize for time out and tear down. This may cause abnormal: first cancel all request in tear down, then time out may complete the request again, but the request may already be freed or restarted. To avoid race between time out and tear down, in tear down process, first we quiesce the queue, and then delete the timer and cancel the time out work for the queue. At the same time we need to delete teardown_lock. Signed-off-by: Chao Leng Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index d6a3e1487354..19f86ea547bb 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -124,7 +124,6 @@ struct nvme_tcp_ctrl { struct sockaddr_storage src_addr; struct nvme_ctrl ctrl; - struct mutex teardown_lock; struct work_struct err_work; struct delayed_work connect_work; struct nvme_tcp_request async_req; @@ -1886,8 +1885,8 @@ out_free_queue: static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove) { - mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); blk_mq_quiesce_queue(ctrl->admin_q); + blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); if (ctrl->admin_tagset) { blk_mq_tagset_busy_iter(ctrl->admin_tagset, @@ -1897,18 +1896,17 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, if (remove) blk_mq_unquiesce_queue(ctrl->admin_q); nvme_tcp_destroy_admin_queue(ctrl, remove); - mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); } static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, bool remove) { - mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); if (ctrl->queue_count <= 1) - goto out; + return; blk_mq_quiesce_queue(ctrl->admin_q); nvme_start_freeze(ctrl); nvme_stop_queues(ctrl); + nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); if (ctrl->tagset) { blk_mq_tagset_busy_iter(ctrl->tagset, @@ -1918,8 +1916,6 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, if (remove) nvme_start_queues(ctrl); nvme_tcp_destroy_io_queues(ctrl, remove); -out: - mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); } static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) @@ -2171,14 +2167,11 @@ static void nvme_tcp_complete_timed_out(struct request *rq) struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; - /* fence other contexts that may complete the command */ - mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue)); if (!blk_mq_request_completed(rq)) { nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; blk_mq_complete_request(rq); } - mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); } static enum blk_eh_timer_return @@ -2455,7 +2448,6 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, nvme_tcp_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work); - mutex_init(&ctrl->teardown_lock); if (!(opts->mask & NVMF_OPT_TRSVCID)) { opts->trsvcid = -- cgit v1.2.3 From fdf58e02adecbef4c7cbb2073d8ea225e6fd5f26 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 22 Oct 2020 10:15:23 +0800 Subject: nvme-rdma: avoid repeated request completion The request may be executed asynchronously, and rq->state may be changed to IDLE. To avoid repeated request completion, only MQ_RQ_COMPLETE of rq->state is checked in nvme_rdma_complete_timed_out. It is not safe, so need adding check IDLE for rq->state. Signed-off-by: Sagi Grimberg Signed-off-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 83dbf2223125..75d071d34319 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1975,7 +1975,7 @@ static void nvme_rdma_complete_timed_out(struct request *rq) struct nvme_rdma_queue *queue = req->queue; nvme_rdma_stop_queue(queue); - if (!blk_mq_request_completed(rq)) { + if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) { nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; blk_mq_complete_request(rq); } -- cgit v1.2.3 From 0a8a2c85b83589a5c10bc5564b796836bf4b4984 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 22 Oct 2020 10:15:31 +0800 Subject: nvme-tcp: avoid repeated request completion The request may be executed asynchronously, and rq->state may be changed to IDLE. To avoid repeated request completion, only MQ_RQ_COMPLETE of rq->state is checked in nvme_tcp_complete_timed_out. It is not safe, so need adding check IDLE for rq->state. Signed-off-by: Sagi Grimberg Signed-off-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 19f86ea547bb..c0c33320fe65 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2168,7 +2168,7 @@ static void nvme_tcp_complete_timed_out(struct request *rq) struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue)); - if (!blk_mq_request_completed(rq)) { + if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) { nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; blk_mq_complete_request(rq); } -- cgit v1.2.3 From 82768a86c64659c7181571ebfbc41ec9f2e52dde Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 3 Nov 2020 15:50:04 +0200 Subject: dt-bindings: irqchip: ti, sci-inta: Fix diagram indentation for unmapped events One space has been missing by the diagram update. Fixes: bb2bd7c7f3d0 ("dt-bindings: irqchip: ti, sci-inta: Update for unmapped event handling") Reported-by: Rob Herring Signed-off-by: Peter Ujfalusi Signed-off-by: Thomas Gleixner Acked-by: Rob Herring Cc: Marc Zyngier Link: https://lore.kernel.org/r/20201103135004.2363-1-peter.ujfalusi@ti.com --- Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml b/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml index cc795498488f..8d90bc593c6e 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml @@ -34,7 +34,7 @@ description: | | | | Unmap | | +--------------+ | - Unmapped events ----->| | umapidx |-------------------------> Globalevents + Unmapped events ---->| | umapidx |-------------------------> Globalevents | +--------------+ | | | +-----------------------------------------+ -- cgit v1.2.3 From aafced673c06b7c77040c1df42e2e965be5d0376 Mon Sep 17 00:00:00 2001 From: Qii Wang Date: Fri, 30 Oct 2020 19:58:01 +0800 Subject: i2c: mediatek: move dma reset before i2c reset The i2c driver default do dma reset after i2c reset, but sometimes i2c reset will trigger dma tx2rx, then apdma write data to dram which has been i2c_put_dma_safe_msg_buf(kfree). Move dma reset before i2c reset in mtk_i2c_init_hw to fix it. Signed-off-by: Qii Wang Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-mt65xx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-mt65xx.c b/drivers/i2c/busses/i2c-mt65xx.c index 0cbdfbe605b5..33de99b7bc20 100644 --- a/drivers/i2c/busses/i2c-mt65xx.c +++ b/drivers/i2c/busses/i2c-mt65xx.c @@ -475,6 +475,10 @@ static void mtk_i2c_init_hw(struct mtk_i2c *i2c) { u16 control_reg; + writel(I2C_DMA_HARD_RST, i2c->pdmabase + OFFSET_RST); + udelay(50); + writel(I2C_DMA_CLR_FLAG, i2c->pdmabase + OFFSET_RST); + mtk_i2c_writew(i2c, I2C_SOFT_RST, OFFSET_SOFTRESET); /* Set ioconfig */ @@ -529,10 +533,6 @@ static void mtk_i2c_init_hw(struct mtk_i2c *i2c) mtk_i2c_writew(i2c, control_reg, OFFSET_CONTROL); mtk_i2c_writew(i2c, I2C_DELAY_LEN, OFFSET_DELAY_LEN); - - writel(I2C_DMA_HARD_RST, i2c->pdmabase + OFFSET_RST); - udelay(50); - writel(I2C_DMA_CLR_FLAG, i2c->pdmabase + OFFSET_RST); } static const struct i2c_spec_values *mtk_i2c_get_spec(unsigned int speed) -- cgit v1.2.3 From d3938ee23e97bfcac2e0eb6b356875da73d700df Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sun, 1 Nov 2020 03:51:02 +0800 Subject: erofs: derive atime instead of leaving it empty EROFS has _only one_ ondisk timestamp (ctime is currently documented and recorded, we might also record mtime instead with a new compat feature if needed) for each extended inode since EROFS isn't mainly for archival purposes so no need to keep all timestamps on disk especially for Android scenarios due to security concerns. Also, romfs/cramfs don't have their own on-disk timestamp, and squashfs only records mtime instead. Let's also derive access time from ondisk timestamp rather than leaving it empty, and if mtime/atime for each file are really needed for specific scenarios as well, we can also use xattrs to record them then. Link: https://lore.kernel.org/r/20201031195102.21221-1-hsiangkao@aol.com [ Gao Xiang: It'd be better to backport for user-friendly concern. ] Fixes: 431339ba9042 ("staging: erofs: add inode operations") Cc: stable # 4.19+ Reported-by: nl6720 Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 139d0bed42f8..3e21c0e8adae 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -107,11 +107,9 @@ static struct page *erofs_read_inode(struct inode *inode, i_gid_write(inode, le32_to_cpu(die->i_gid)); set_nlink(inode, le32_to_cpu(die->i_nlink)); - /* ns timestamp */ - inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = - le64_to_cpu(die->i_ctime); - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = - le32_to_cpu(die->i_ctime_nsec); + /* extended inode has its own timestamp */ + inode->i_ctime.tv_sec = le64_to_cpu(die->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(die->i_ctime_nsec); inode->i_size = le64_to_cpu(die->i_size); @@ -149,11 +147,9 @@ static struct page *erofs_read_inode(struct inode *inode, i_gid_write(inode, le16_to_cpu(dic->i_gid)); set_nlink(inode, le16_to_cpu(dic->i_nlink)); - /* use build time to derive all file time */ - inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = - sbi->build_time; - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = - sbi->build_time_nsec; + /* use build time for compact inodes */ + inode->i_ctime.tv_sec = sbi->build_time; + inode->i_ctime.tv_nsec = sbi->build_time_nsec; inode->i_size = le32_to_cpu(dic->i_size); if (erofs_inode_is_data_compressed(vi->datalayout)) @@ -167,6 +163,11 @@ static struct page *erofs_read_inode(struct inode *inode, goto err_out; } + inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; + inode->i_atime.tv_sec = inode->i_ctime.tv_sec; + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec; + inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; + if (!nblks) /* measure inode.i_blocks as generic filesystems */ inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; -- cgit v1.2.3 From a30573b3cdc77b8533d004ece1ea7c0146b437a0 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 22 Oct 2020 22:57:21 +0800 Subject: erofs: fix setting up pcluster for temporary pages pcluster should be only set up for all managed pages instead of temporary pages. Since it currently uses page->mapping to identify, the impact is minor for now. [ Update: Vladimir reported the kernel log becomes polluted because PAGE_FLAGS_CHECK_AT_FREE flag(s) set if the page allocation debug option is enabled. ] Link: https://lore.kernel.org/r/20201022145724.27284-1-hsiangkao@aol.com Fixes: 5ddcee1f3a1c ("erofs: get rid of __stagingpage_alloc helper") Cc: # 5.5+ Tested-by: Vladimir Zapolskiy Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 50912a5420b4..86fd3bf62af6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1078,8 +1078,11 @@ out_allocpage: cond_resched(); goto repeat; } - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); + + if (tocache) { + set_page_private(page, (unsigned long)pcl); + SetPagePrivate(page); + } out: /* the only exit (for tracing and debugging) */ return page; } -- cgit v1.2.3 From 985616f0457d9f555fff417d0da56174f70cc14f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 26 Oct 2020 09:25:48 +0100 Subject: USB: serial: cyberjack: fix write-URB completion race The write-URB busy flag was being cleared before the completion handler was done with the URB, something which could lead to corrupt transfers due to a racing write request if the URB is resubmitted. Fixes: 507ca9bc0476 ("[PATCH] USB: add ability for usb-serial drivers to determine if their write urb is currently being used.") Cc: stable # 2.6.13 Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold --- drivers/usb/serial/cyberjack.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c index 821970609695..2e40908963da 100644 --- a/drivers/usb/serial/cyberjack.c +++ b/drivers/usb/serial/cyberjack.c @@ -357,11 +357,12 @@ static void cyberjack_write_bulk_callback(struct urb *urb) struct device *dev = &port->dev; int status = urb->status; unsigned long flags; + bool resubmitted = false; - set_bit(0, &port->write_urbs_free); if (status) { dev_dbg(dev, "%s - nonzero write bulk status received: %d\n", __func__, status); + set_bit(0, &port->write_urbs_free); return; } @@ -394,6 +395,8 @@ static void cyberjack_write_bulk_callback(struct urb *urb) goto exit; } + resubmitted = true; + dev_dbg(dev, "%s - priv->wrsent=%d\n", __func__, priv->wrsent); dev_dbg(dev, "%s - priv->wrfilled=%d\n", __func__, priv->wrfilled); @@ -410,6 +413,8 @@ static void cyberjack_write_bulk_callback(struct urb *urb) exit: spin_unlock_irqrestore(&priv->lock, flags); + if (!resubmitted) + set_bit(0, &port->write_urbs_free); usb_serial_port_softint(port); } -- cgit v1.2.3 From 489979b4aab490b6b917c11dc02d81b4b742784a Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Sat, 31 Oct 2020 23:54:58 +0100 Subject: USB: serial: option: add LE910Cx compositions 0x1203, 0x1230, 0x1231 Add following Telit LE910Cx compositions: 0x1203: rndis, tty, adb, tty, tty, tty, tty 0x1230: tty, adb, rmnet, audio, tty, tty, tty, tty 0x1231: rndis, tty, adb, audio, tty, tty, tty, tty Signed-off-by: Daniele Palmas Link: https://lore.kernel.org/r/20201031225458.10512-1-dnlplm@gmail.com [ johan: add comments after entries ] Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 7e879233bc0e..284d086db7a2 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1203,6 +1203,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1203, 0xff), /* Telit LE910Cx (RNDIS) */ + .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920), @@ -1217,6 +1219,10 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1213, 0xff) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1214), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) | RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1230, 0xff), /* Telit LE910Cx (rmnet) */ + .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1231, 0xff), /* Telit LE910Cx (RNDIS) */ + .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, 0x1260), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE(TELIT_VENDOR_ID, 0x1261), -- cgit v1.2.3 From d181bfe36715a1834958cf2d62253b624adfae51 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 3 Nov 2020 09:34:08 +0100 Subject: Documentation: remove mic/index from misc-devices/index.rst With the recent removal of the misc/mic/ directory, the documentation build now warns because we forgot about this index file. Fix that up so that there are no more warnings here. Reported-by: Stephen Rothwell Cc: Sudeep Dutt Link: https://lore.kernel.org/r/20201103083408.GA2511903@kroah.com Signed-off-by: Greg Kroah-Hartman --- Documentation/misc-devices/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/misc-devices/index.rst b/Documentation/misc-devices/index.rst index 46072ce3d7ef..64420b3314fe 100644 --- a/Documentation/misc-devices/index.rst +++ b/Documentation/misc-devices/index.rst @@ -24,7 +24,6 @@ fit into other categories. isl29003 lis3lv02d max6875 - mic/index pci-endpoint-test spear-pcie-gadget uacce -- cgit v1.2.3 From 4d6ffa27b8e5116c0abb318790fd01d4e12d75e6 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 2 Nov 2020 17:23:58 -0800 Subject: x86/lib: Change .weak to SYM_FUNC_START_WEAK for arch/x86/lib/mem*_64.S Commit 393f203f5fd5 ("x86_64: kasan: add interceptors for memset/memmove/memcpy functions") added .weak directives to arch/x86/lib/mem*_64.S instead of changing the existing ENTRY macros to WEAK. This can lead to the assembly snippet .weak memcpy ... .globl memcpy which will produce a STB_WEAK memcpy with GNU as but STB_GLOBAL memcpy with LLVM's integrated assembler before LLVM 12. LLVM 12 (since https://reviews.llvm.org/D90108) will error on such an overridden symbol binding. Commit ef1e03152cb0 ("x86/asm: Make some functions local") changed ENTRY in arch/x86/lib/memcpy_64.S to SYM_FUNC_START_LOCAL, which was ineffective due to the preceding .weak directive. Use the appropriate SYM_FUNC_START_WEAK instead. Fixes: 393f203f5fd5 ("x86_64: kasan: add interceptors for memset/memmove/memcpy functions") Fixes: ef1e03152cb0 ("x86/asm: Make some functions local") Reported-by: Sami Tolvanen Signed-off-by: Fangrui Song Signed-off-by: Borislav Petkov Reviewed-by: Nick Desaulniers Tested-by: Nathan Chancellor Tested-by: Nick Desaulniers Cc: Link: https://lkml.kernel.org/r/20201103012358.168682-1-maskray@google.com --- arch/x86/lib/memcpy_64.S | 4 +--- arch/x86/lib/memmove_64.S | 4 +--- arch/x86/lib/memset_64.S | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 037faac46b0c..1e299ac73c86 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -16,8 +16,6 @@ * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. */ -.weak memcpy - /* * memcpy - Copy a memory block. * @@ -30,7 +28,7 @@ * rax original destination */ SYM_FUNC_START_ALIAS(__memcpy) -SYM_FUNC_START_LOCAL(memcpy) +SYM_FUNC_START_WEAK(memcpy) ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ "jmp memcpy_erms", X86_FEATURE_ERMS diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 7ff00ea64e4f..41902fe8b859 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -24,9 +24,7 @@ * Output: * rax: dest */ -.weak memmove - -SYM_FUNC_START_ALIAS(memmove) +SYM_FUNC_START_WEAK(memmove) SYM_FUNC_START(__memmove) mov %rdi, %rax diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 9ff15ee404a4..0bfd26e4ca9e 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -6,8 +6,6 @@ #include #include -.weak memset - /* * ISO C memset - set a memory block to a byte value. This function uses fast * string to get better performance than the original function. The code is @@ -19,7 +17,7 @@ * * rax original destination */ -SYM_FUNC_START_ALIAS(memset) +SYM_FUNC_START_WEAK(memset) SYM_FUNC_START(__memset) /* * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended -- cgit v1.2.3 From 869ae85dae64b5540e4362d7fe4cd520e10ec05c Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 29 Oct 2020 14:30:48 -0700 Subject: xfs: flush new eof page on truncate to avoid post-eof corruption It is possible to expose non-zeroed post-EOF data in XFS if the new EOF page is dirty, backed by an unwritten block and the truncate happens to race with writeback. iomap_truncate_page() will not zero the post-EOF portion of the page if the underlying block is unwritten. The subsequent call to truncate_setsize() will, but doesn't dirty the page. Therefore, if writeback happens to complete after iomap_truncate_page() (so it still sees the unwritten block) but before truncate_setsize(), the cached page becomes inconsistent with the on-disk block. A mapped read after the associated page is reclaimed or invalidated exposes non-zero post-EOF data. For example, consider the following sequence when run on a kernel modified to explicitly flush the new EOF page within the race window: $ xfs_io -fc "falloc 0 4k" -c fsync /mnt/file $ xfs_io -c "pwrite 0 4k" -c "truncate 1k" /mnt/file ... $ xfs_io -c "mmap 0 4k" -c "mread -v 1k 8" /mnt/file 00000400: 00 00 00 00 00 00 00 00 ........ $ umount /mnt/; mount /mnt/ $ xfs_io -c "mmap 0 4k" -c "mread -v 1k 8" /mnt/file 00000400: cd cd cd cd cd cd cd cd ........ Update xfs_setattr_size() to explicitly flush the new EOF page prior to the page truncate to ensure iomap has the latest state of the underlying block. Fixes: 68a9f5e7007c ("xfs: implement iomap based buffered write path") Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_iops.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 5e165456da68..1414ab79eacf 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -911,6 +911,16 @@ xfs_setattr_size( error = iomap_zero_range(inode, oldsize, newsize - oldsize, &did_zeroing, &xfs_buffered_write_iomap_ops); } else { + /* + * iomap won't detect a dirty page over an unwritten block (or a + * cow block over a hole) and subsequently skips zeroing the + * newly post-EOF portion of the page. Flush the new EOF to + * convert the block before the pagecache truncate. + */ + error = filemap_write_and_wait_range(inode->i_mapping, newsize, + newsize); + if (error) + return error; error = iomap_truncate_page(inode, newsize, &did_zeroing, &xfs_buffered_write_iomap_ops); } -- cgit v1.2.3 From 763e4cdc0f6d5cea45c896fef67f7be4bdefcca7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 29 Oct 2020 14:30:48 -0700 Subject: iomap: support partial page discard on writeback block mapping failure iomap writeback mapping failure only calls into ->discard_page() if the current page has not been added to the ioend. Accordingly, the XFS callback assumes a full page discard and invalidation. This is problematic for sub-page block size filesystems where some portion of a page might have been mapped successfully before a failure to map a delalloc block occurs. ->discard_page() is not called in that error scenario and the bio is explicitly failed by iomap via the error return from ->prepare_ioend(). As a result, the filesystem leaks delalloc blocks and corrupts the filesystem block counters. Since XFS is the only user of ->discard_page(), tweak the semantics to invoke the callback unconditionally on mapping errors and provide the file offset that failed to map. Update xfs_discard_page() to discard the corresponding portion of the file and pass the range along to iomap_invalidatepage(). The latter already properly handles both full and sub-page scenarios by not changing any iomap or page state on sub-page invalidations. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 15 ++++++++------- fs/xfs/xfs_aops.c | 14 ++++++++------ include/linux/iomap.h | 2 +- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8180061b9e16..e4ea1f9f94d0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1382,14 +1382,15 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * appropriately. */ if (unlikely(error)) { + /* + * Let the filesystem know what portion of the current page + * failed to map. If the page wasn't been added to ioend, it + * won't be affected by I/O completion and we must unlock it + * now. + */ + if (wpc->ops->discard_page) + wpc->ops->discard_page(page, file_offset); if (!count) { - /* - * If the current page hasn't been added to ioend, it - * won't be affected by I/O completions and we must - * discard and unlock it right here. - */ - if (wpc->ops->discard_page) - wpc->ops->discard_page(page); ClearPageUptodate(page); unlock_page(page); goto done; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 55d126d4e096..5bf37afae5e9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -527,13 +527,15 @@ xfs_prepare_ioend( */ static void xfs_discard_page( - struct page *page) + struct page *page, + loff_t fileoff) { struct inode *inode = page->mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - loff_t offset = page_offset(page); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset); + unsigned int pageoff = offset_in_page(fileoff); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, fileoff); + xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff); int error; if (XFS_FORCED_SHUTDOWN(mp)) @@ -541,14 +543,14 @@ xfs_discard_page( xfs_alert_ratelimited(mp, "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", - page, ip->i_ino, offset); + page, ip->i_ino, fileoff); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - i_blocks_per_page(inode, page)); + i_blocks_per_page(inode, page) - pageoff_fsb); if (error && !XFS_FORCED_SHUTDOWN(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: - iomap_invalidatepage(page, 0, PAGE_SIZE); + iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff); } static const struct iomap_writeback_ops xfs_writeback_ops = { diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 172b3397a1a3..5bd3cac4df9c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -221,7 +221,7 @@ struct iomap_writeback_ops { * Optional, allows the file system to discard state on a page where * we failed to submit any I/O. */ - void (*discard_page)(struct page *page); + void (*discard_page)(struct page *page, loff_t fileoff); }; struct iomap_writepage_ctx { -- cgit v1.2.3 From 50e7d6c7a5210063b9a6f0d8799d9d1440907fcf Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 29 Oct 2020 14:30:49 -0700 Subject: iomap: clean up writeback state logic on writepage error The iomap writepage error handling logic is a mash of old and slightly broken XFS writepage logic. When keepwrite writeback state tracking was introduced in XFS in commit 0d085a529b42 ("xfs: ensure WB_SYNC_ALL writeback handles partial pages correctly"), XFS had an additional cluster writeback context that scanned ahead of ->writepage() to process dirty pages over the current ->writepage() extent mapping. This context expected a dirty page and required retention of the TOWRITE tag on partial page processing so the higher level writeback context would revisit the page (in contrast to ->writepage(), which passes a page with the dirty bit already cleared). The cluster writeback mechanism was eventually removed and some of the error handling logic folded into the primary writeback path in commit 150d5be09ce4 ("xfs: remove xfs_cancel_ioend"). This patch accidentally conflated the two contexts by using the keepwrite logic in ->writepage() without accounting for the fact that the page is not dirty. Further, the keepwrite logic has no practical effect on the core ->writepage() caller (write_cache_pages()) because it never revisits a page in the current function invocation. Technically, the page should be redirtied for the keepwrite logic to have any effect. Otherwise, write_cache_pages() may find the tagged page but will skip it since it is clean. Even if the page was redirtied, however, there is still no practical effect to keepwrite since write_cache_pages() does not wrap around within a single invocation of the function. Therefore, the dirty page would simply end up retagged on the next writeback sequence over the associated range. All that being said, none of this really matters because redirtying a partially processed page introduces a potential infinite redirty -> writeback failure loop that deviates from the current design principle of clearing the dirty state on writepage failure to avoid building up too much dirty, unreclaimable memory on the system. Therefore, drop the spurious keepwrite usage and dirty state clearing logic from iomap_writepage_map(), treat the partially processed page the same as a fully processed page, and let the imminent ioend failure clean up the writeback state. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index e4ea1f9f94d0..10cc7979ce38 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1374,6 +1374,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); WARN_ON_ONCE(!PageLocked(page)); WARN_ON_ONCE(PageWriteback(page)); + WARN_ON_ONCE(PageDirty(page)); /* * We cannot cancel the ioend directly here on error. We may have @@ -1395,21 +1396,9 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, unlock_page(page); goto done; } - - /* - * If the page was not fully cleaned, we need to ensure that the - * higher layers come back to it correctly. That means we need - * to keep the page dirty, and for WB_SYNC_ALL writeback we need - * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed - * so another attempt to write this page in this writeback sweep - * will be made. - */ - set_page_writeback_keepwrite(page); - } else { - clear_page_dirty_for_io(page); - set_page_writeback(page); } + set_page_writeback(page); unlock_page(page); /* -- cgit v1.2.3 From c2f09217a4305478c55adc9a98692488dd19cd32 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Nov 2020 17:14:06 -0800 Subject: xfs: fix missing CoW blocks writeback conversion retry In commit 7588cbeec6df, we tried to fix a race stemming from the lack of coordination between higher level code that wants to allocate and remap CoW fork extents into the data fork. Christoph cites as examples the always_cow mode, and a directio write completion racing with writeback. According to the comments before the goto retry, we want to restart the lookup to catch the extent in the data fork, but we don't actually reset whichfork or cow_fsb, which means the second try executes using stale information. Up until now I think we've gotten lucky that either there's something left in the CoW fork to cause cow_fsb to be reset, or either data/cow fork sequence numbers have advanced enough to force a fresh lookup from the data fork. However, if we reach the retry with an empty stable CoW fork and a stable data fork, neither of those things happens. The retry foolishly re-calls xfs_convert_blocks on the CoW fork which fails again. This time, we toss the write. I've recently been working on extending reflink to the realtime device. When the realtime extent size is larger than a single block, we have to force the page cache to CoW the entire rt extent if a write (or fallocate) are not aligned with the rt extent size. The strategy I've chosen to deal with this is derived from Dave's blocksize > pagesize series: dirtying around the write range, and ensuring that writeback always starts mapping on an rt extent boundary. This has brought this race front and center, since generic/522 blows up immediately. However, I'm pretty sure this is a bug outright, independent of that. Fixes: 7588cbeec6df ("xfs: retry COW fork delalloc conversion when no extent was found") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_aops.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5bf37afae5e9..4304c6416fbb 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -346,8 +346,8 @@ xfs_map_blocks( ssize_t count = i_blocksize(inode); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); - xfs_fileoff_t cow_fsb = NULLFILEOFF; - int whichfork = XFS_DATA_FORK; + xfs_fileoff_t cow_fsb; + int whichfork; struct xfs_bmbt_irec imap; struct xfs_iext_cursor icur; int retries = 0; @@ -381,6 +381,8 @@ xfs_map_blocks( * landed in a hole and we skip the block. */ retry: + cow_fsb = NULLFILEOFF; + whichfork = XFS_DATA_FORK; xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); -- cgit v1.2.3 From c1f6b1ac00756a7108e5fcb849a2f8230c0b62a5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Nov 2020 17:14:07 -0800 Subject: xfs: fix scrub flagging rtinherit even if there is no rt device The kernel has always allowed directories to have the rtinherit flag set, even if there is no rt device, so this check is wrong. Fixes: 80e4e1268802 ("xfs: scrub inodes") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 3aa85b64de36..bb25ff1b770d 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -121,8 +121,7 @@ xchk_inode_flags( goto bad; /* rt flags require rt device */ - if ((flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT)) && - !mp->m_rtdev_targp) + if ((flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) goto bad; /* new rt bitmap flag only valid for rbmino */ -- cgit v1.2.3 From 9d820f68b2bdba5b2e7bf135123c3f57c5051d05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2020 14:06:23 +0100 Subject: entry: Fix the incorrect ordering of lockdep and RCU check When an exception/interrupt hits kernel space and the kernel is not currently in the idle task then RCU must be watching. irqentry_enter() validates this via rcu_irq_enter_check_tick(), which in turn invokes lockdep when taking a lock. But at that point lockdep does not yet know about the fact that interrupts have been disabled by the CPU, which triggers a lockdep splat complaining about inconsistent state. Invoking trace_hardirqs_off() before rcu_irq_enter_check_tick() defeats the point of rcu_irq_enter_check_tick() because trace_hardirqs_off() uses RCU. So use the same sequence as for the idle case and tell lockdep about the irq state change first, invoke the RCU check and then do the lockdep and tracer update. Fixes: a5497bab5f72 ("entry: Provide generic interrupt entry/exit code") Reported-by: Mark Rutland Signed-off-by: Thomas Gleixner Tested-by: Mark Rutland Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87y2jhl19s.fsf@nanos.tec.linutronix.de --- kernel/entry/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 2b8366693d5c..e9e2df3f3f9e 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -337,10 +337,10 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * already contains a warning when RCU is not watching, so no point * in having another one here. */ + lockdep_hardirqs_off(CALLER_ADDR0); instrumentation_begin(); rcu_irq_enter_check_tick(); - /* Use the combo lockdep/tracing function */ - trace_hardirqs_off(); + trace_hardirqs_off_finish(); instrumentation_end(); return ret; -- cgit v1.2.3 From 3dd1680d1418f22f7ddaf98a4eab66285a099b3e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Oct 2020 09:36:41 -0600 Subject: io-wq: cancel request if it's asking for files and we don't have them This can't currently happen, but will be possible shortly. Handle missing files just like we do not being able to grab a needed mm, and mark the request as needing cancelation. Signed-off-by: Jens Axboe --- fs/io-wq.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/io-wq.c b/fs/io-wq.c index 02894df7656d..b53c055bea6a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -482,6 +482,10 @@ static void io_impersonate_work(struct io_worker *worker, current->files = work->identity->files; current->nsproxy = work->identity->nsproxy; task_unlock(current); + if (!work->identity->files) { + /* failed grabbing files, ensure work gets cancelled */ + work->flags |= IO_WQ_WORK_CANCEL; + } } if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs) current->fs = work->identity->fs; -- cgit v1.2.3 From fdaf083cdfb556a45c422c8998268baf1ab26829 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Oct 2020 09:37:30 -0600 Subject: io_uring: properly handle SQPOLL request cancelations Track if a given task io_uring context contains SQPOLL instances, so we can iterate those for cancelation (and request counts). This ensures that we properly wait on SQPOLL contexts, and find everything that needs canceling. Signed-off-by: Jens Axboe --- fs/io_uring.c | 77 ++++++++++++++++++++++++++++++++++++++++-------- include/linux/io_uring.h | 3 +- 2 files changed, 67 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a7429c977eb3..b398394a919e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1668,7 +1668,8 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) { + } else if (ctx->cq_overflow_flushed || + atomic_read(&req->task->io_uring->in_idle)) { /* * If we're in ring overflow flush mode, or in task cancel mode, * then we cannot store the request for later flushing, we need @@ -1838,7 +1839,7 @@ static void __io_free_req(struct io_kiocb *req) io_dismantle_req(req); percpu_counter_dec(&tctx->inflight); - if (tctx->in_idle) + if (atomic_read(&tctx->in_idle)) wake_up(&tctx->wait); put_task_struct(req->task); @@ -7695,7 +7696,8 @@ static int io_uring_alloc_task_context(struct task_struct *task) xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); tctx->last = NULL; - tctx->in_idle = 0; + atomic_set(&tctx->in_idle, 0); + tctx->sqpoll = false; io_init_identity(&tctx->__identity); tctx->identity = &tctx->__identity; task->io_uring = tctx; @@ -8598,8 +8600,11 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, { struct task_struct *task = current; - if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { task = ctx->sq_data->thread; + atomic_inc(&task->io_uring->in_idle); + io_sq_thread_park(ctx->sq_data); + } io_cqring_overflow_flush(ctx, true, task, files); @@ -8607,12 +8612,23 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_run_task_work(); cond_resched(); } + + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { + atomic_dec(&task->io_uring->in_idle); + /* + * If the files that are going away are the ones in the thread + * identity, clear them out. + */ + if (task->io_uring->identity->files == files) + task->io_uring->identity->files = NULL; + io_sq_thread_unpark(ctx->sq_data); + } } /* * Note that this task has used io_uring. We use it for cancelation purposes. */ -static int io_uring_add_task_file(struct file *file) +static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) { struct io_uring_task *tctx = current->io_uring; @@ -8634,6 +8650,14 @@ static int io_uring_add_task_file(struct file *file) tctx->last = file; } + /* + * This is race safe in that the task itself is doing this, hence it + * cannot be going through the exit/cancel paths at the same time. + * This cannot be modified while exit/cancel is running. + */ + if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL)) + tctx->sqpoll = true; + return 0; } @@ -8675,7 +8699,7 @@ void __io_uring_files_cancel(struct files_struct *files) unsigned long index; /* make sure overflow events are dropped */ - tctx->in_idle = true; + atomic_inc(&tctx->in_idle); xa_for_each(&tctx->xa, index, file) { struct io_ring_ctx *ctx = file->private_data; @@ -8684,6 +8708,35 @@ void __io_uring_files_cancel(struct files_struct *files) if (files) io_uring_del_task_file(file); } + + atomic_dec(&tctx->in_idle); +} + +static s64 tctx_inflight(struct io_uring_task *tctx) +{ + unsigned long index; + struct file *file; + s64 inflight; + + inflight = percpu_counter_sum(&tctx->inflight); + if (!tctx->sqpoll) + return inflight; + + /* + * If we have SQPOLL rings, then we need to iterate and find them, and + * add the pending count for those. + */ + xa_for_each(&tctx->xa, index, file) { + struct io_ring_ctx *ctx = file->private_data; + + if (ctx->flags & IORING_SETUP_SQPOLL) { + struct io_uring_task *__tctx = ctx->sqo_task->io_uring; + + inflight += percpu_counter_sum(&__tctx->inflight); + } + } + + return inflight; } /* @@ -8697,11 +8750,11 @@ void __io_uring_task_cancel(void) s64 inflight; /* make sure overflow events are dropped */ - tctx->in_idle = true; + atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ - inflight = percpu_counter_sum(&tctx->inflight); + inflight = tctx_inflight(tctx); if (!inflight) break; __io_uring_files_cancel(NULL); @@ -8712,13 +8765,13 @@ void __io_uring_task_cancel(void) * If we've seen completions, retry. This avoids a race where * a completion comes in before we did prepare_to_wait(). */ - if (inflight != percpu_counter_sum(&tctx->inflight)) + if (inflight != tctx_inflight(tctx)) continue; schedule(); } while (1); finish_wait(&tctx->wait, &wait); - tctx->in_idle = false; + atomic_dec(&tctx->in_idle); } static int io_uring_flush(struct file *file, void *data) @@ -8863,7 +8916,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_sqpoll_wait_sq(ctx); submitted = to_submit; } else if (to_submit) { - ret = io_uring_add_task_file(f.file); + ret = io_uring_add_task_file(ctx, f.file); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); @@ -9092,7 +9145,7 @@ err_fd: #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; #endif - if (unlikely(io_uring_add_task_file(file))) { + if (unlikely(io_uring_add_task_file(ctx, file))) { file = ERR_PTR(-ENOMEM); goto err_fd; } diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 868364cea3b7..35b2d845704d 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -30,7 +30,8 @@ struct io_uring_task { struct percpu_counter inflight; struct io_identity __identity; struct io_identity *identity; - bool in_idle; + atomic_t in_idle; + bool sqpoll; }; #if defined(CONFIG_IO_URING) -- cgit v1.2.3 From 4b70cf9dea4cd239b425f3282fa56ce19e234c8a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 2 Nov 2020 10:39:05 -0700 Subject: io_uring: ensure consistent view of original task ->mm from SQPOLL Ensure we get a valid view of the task mm, by using task_lock() when attempting to grab the original task mm. Reported-by: syzbot+b57abf7ee60829090495@syzkaller.appspotmail.com Fixes: 2aede0e417db ("io_uring: stash ctx task reference for SQPOLL") Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b398394a919e..fd61708dba2b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -995,20 +995,33 @@ static void io_sq_thread_drop_mm(void) if (mm) { kthread_unuse_mm(mm); mmput(mm); + current->mm = NULL; } } static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { - if (!current->mm) { - if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || - !ctx->sqo_task->mm || - !mmget_not_zero(ctx->sqo_task->mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_task->mm); + struct mm_struct *mm; + + if (current->mm) + return 0; + + /* Should never happen */ + if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL))) + return -EFAULT; + + task_lock(ctx->sqo_task); + mm = ctx->sqo_task->mm; + if (unlikely(!mm || !mmget_not_zero(mm))) + mm = NULL; + task_unlock(ctx->sqo_task); + + if (mm) { + kthread_use_mm(mm); + return 0; } - return 0; + return -EFAULT; } static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, -- cgit v1.2.3 From cb8a8ae310741d743fd02982307797f6a126f614 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 3 Nov 2020 12:19:07 -0700 Subject: io_uring: drop req/tctx io_identity separately We can't bundle this into one operation, as the identity may not have originated from the tctx to begin with. Drop one ref for each of them separately, if they don't match the static assignment. If we don't, then if the identity is a lookup from registered credentials, we could be freeing that identity as we're dropping a reference assuming it came from the tctx. syzbot reports this as a use-after-free, as the identity is still referencable from idr lookup: ================================================================== BUG: KASAN: use-after-free in instrument_atomic_read_write include/linux/instrumented.h:101 [inline] BUG: KASAN: use-after-free in atomic_fetch_add_relaxed include/asm-generic/atomic-instrumented.h:142 [inline] BUG: KASAN: use-after-free in __refcount_add include/linux/refcount.h:193 [inline] BUG: KASAN: use-after-free in __refcount_inc include/linux/refcount.h:250 [inline] BUG: KASAN: use-after-free in refcount_inc include/linux/refcount.h:267 [inline] BUG: KASAN: use-after-free in io_init_req fs/io_uring.c:6700 [inline] BUG: KASAN: use-after-free in io_submit_sqes+0x15a9/0x25f0 fs/io_uring.c:6774 Write of size 4 at addr ffff888011e08e48 by task syz-executor165/8487 CPU: 1 PID: 8487 Comm: syz-executor165 Not tainted 5.10.0-rc1-next-20201102-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xae/0x4c8 mm/kasan/report.c:385 __kasan_report mm/kasan/report.c:545 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:562 check_memory_region_inline mm/kasan/generic.c:186 [inline] check_memory_region+0x13d/0x180 mm/kasan/generic.c:192 instrument_atomic_read_write include/linux/instrumented.h:101 [inline] atomic_fetch_add_relaxed include/asm-generic/atomic-instrumented.h:142 [inline] __refcount_add include/linux/refcount.h:193 [inline] __refcount_inc include/linux/refcount.h:250 [inline] refcount_inc include/linux/refcount.h:267 [inline] io_init_req fs/io_uring.c:6700 [inline] io_submit_sqes+0x15a9/0x25f0 fs/io_uring.c:6774 __do_sys_io_uring_enter+0xc8e/0x1b50 fs/io_uring.c:9159 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x440e19 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 0f fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fff644ff178 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 0000000000440e19 RDX: 0000000000000000 RSI: 000000000000450c RDI: 0000000000000003 RBP: 0000000000000004 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000022b4850 R13: 0000000000000010 R14: 0000000000000000 R15: 0000000000000000 Allocated by task 8487: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track mm/kasan/common.c:56 [inline] __kasan_kmalloc.constprop.0+0xc2/0xd0 mm/kasan/common.c:461 kmalloc include/linux/slab.h:552 [inline] io_register_personality fs/io_uring.c:9638 [inline] __io_uring_register fs/io_uring.c:9874 [inline] __do_sys_io_uring_register+0x10f0/0x40a0 fs/io_uring.c:9924 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 8487: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track+0x1c/0x30 mm/kasan/common.c:56 kasan_set_free_info+0x1b/0x30 mm/kasan/generic.c:355 __kasan_slab_free+0x102/0x140 mm/kasan/common.c:422 slab_free_hook mm/slub.c:1544 [inline] slab_free_freelist_hook+0x5d/0x150 mm/slub.c:1577 slab_free mm/slub.c:3140 [inline] kfree+0xdb/0x360 mm/slub.c:4122 io_identity_cow fs/io_uring.c:1380 [inline] io_prep_async_work+0x903/0xbc0 fs/io_uring.c:1492 io_prep_async_link fs/io_uring.c:1505 [inline] io_req_defer fs/io_uring.c:5999 [inline] io_queue_sqe+0x212/0xed0 fs/io_uring.c:6448 io_submit_sqe fs/io_uring.c:6542 [inline] io_submit_sqes+0x14f6/0x25f0 fs/io_uring.c:6784 __do_sys_io_uring_enter+0xc8e/0x1b50 fs/io_uring.c:9159 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff888011e08e00 which belongs to the cache kmalloc-96 of size 96 The buggy address is located 72 bytes inside of 96-byte region [ffff888011e08e00, ffff888011e08e60) The buggy address belongs to the page: page:00000000a7104751 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11e08 flags: 0xfff00000000200(slab) raw: 00fff00000000200 ffffea00004f8540 0000001f00000002 ffff888010041780 raw: 0000000000000000 0000000080200020 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888011e08d00: 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc ffff888011e08d80: 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc > ffff888011e08e00: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ^ ffff888011e08e80: 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc ffff888011e08f00: 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc ================================================================== Reported-by: syzbot+625ce3bb7835b63f7f3d@syzkaller.appspotmail.com Fixes: 1e6fa5216a0e ("io_uring: COW io_identity on mismatch") Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fd61708dba2b..728f3a368a01 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1287,9 +1287,12 @@ static bool io_identity_cow(struct io_kiocb *req) /* add one for this request */ refcount_inc(&id->count); - /* drop old identity, assign new one. one ref for req, one for tctx */ - if (req->work.identity != tctx->identity && - refcount_sub_and_test(2, &req->work.identity->count)) + /* drop tctx and req identity references, if needed */ + if (tctx->identity != &tctx->__identity && + refcount_dec_and_test(&tctx->identity->count)) + kfree(tctx->identity); + if (req->work.identity != &tctx->__identity && + refcount_dec_and_test(&req->work.identity->count)) kfree(req->work.identity); req->work.identity = id; -- cgit v1.2.3 From 99b328084f6a98bcee9fcd423c82ccfd52115da5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 4 Nov 2020 13:39:31 +0000 Subject: io_uring: fix overflowed cancel w/ linked ->files Current io_match_files() check in io_cqring_overflow_flush() is useless because requests drop ->files before going to the overflow list, however linked to it request do not, and we don't check them. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 728f3a368a01..d6f7f8b3837f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1593,14 +1593,29 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) } } -static inline bool io_match_files(struct io_kiocb *req, - struct files_struct *files) +static inline bool __io_match_files(struct io_kiocb *req, + struct files_struct *files) { + return ((req->flags & REQ_F_WORK_INITIALIZED) && + (req->work.flags & IO_WQ_WORK_FILES)) && + req->work.identity->files == files; +} + +static bool io_match_files(struct io_kiocb *req, + struct files_struct *files) +{ + struct io_kiocb *link; + if (!files) return true; - if ((req->flags & REQ_F_WORK_INITIALIZED) && - (req->work.flags & IO_WQ_WORK_FILES)) - return req->work.identity->files == files; + if (__io_match_files(req, files)) + return true; + if (req->flags & REQ_F_LINK_HEAD) { + list_for_each_entry(link, &req->link_list, link_list) { + if (__io_match_files(link, files)) + return true; + } + } return false; } @@ -8406,22 +8421,6 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) return false; } -static bool io_match_link_files(struct io_kiocb *req, - struct files_struct *files) -{ - struct io_kiocb *link; - - if (io_match_files(req, files)) - return true; - if (req->flags & REQ_F_LINK_HEAD) { - list_for_each_entry(link, &req->link_list, link_list) { - if (io_match_files(link, files)) - return true; - } - } - return false; -} - /* * We're looking to cancel 'req' because it's holding on to our files, but * 'req' could be a link to another request. See if it is, and cancel that @@ -8504,7 +8503,7 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_link_files(de->req, files)) { + if (io_match_files(de->req, files)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } -- cgit v1.2.3 From 62575e270f661aba64778cbc5f354511cf9abb21 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Oct 2020 09:39:06 -0400 Subject: ceph: check session state after bumping session->s_seq Some messages sent by the MDS entail a session sequence number increment, and the MDS will drop certain types of requests on the floor when the sequence numbers don't match. In particular, a REQUEST_CLOSE message can cross with one of the sequence morphing messages from the MDS which can cause the client to stall, waiting for a response that will never come. Originally, this meant an up to 5s delay before the recurring workqueue job kicked in and resent the request, but a recent change made it so that the client would never resend, causing a 60s stall unmounting and sometimes a blockisting event. Add a new helper for incrementing the session sequence and then testing to see whether a REQUEST_CLOSE needs to be resent, and move the handling of CEPH_MDS_SESSION_CLOSING into that function. Change all of the bare sequence counter increments to use the new helper. Reorganize check_session_state with a switch statement. It should no longer be called when the session is CLOSING, so throw a warning if it ever is (but still handle that case sanely). [ idryomov: whitespace, pr_err() call fixup ] URL: https://tracker.ceph.com/issues/47563 Fixes: fa9967734227 ("ceph: fix potential mdsc use-after-free crash") Reported-by: Patrick Donnelly Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 +- fs/ceph/mds_client.c | 50 +++++++++++++++++++++++++++++++++++--------------- fs/ceph/mds_client.h | 1 + fs/ceph/quota.c | 2 +- fs/ceph/snap.c | 2 +- 5 files changed, 39 insertions(+), 18 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5027bbdca419..ded4229c314a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4074,7 +4074,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, vino.snap, inode); mutex_lock(&session->s_mutex); - session->s_seq++; + inc_session_sequence(session); dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 08f1c0c31dc2..8f1d7500a7ec 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4231,7 +4231,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, dname.len, dname.name); mutex_lock(&session->s_mutex); - session->s_seq++; + inc_session_sequence(session); if (!inode) { dout("handle_lease no inode %llx\n", vino.ino); @@ -4385,28 +4385,48 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) bool check_session_state(struct ceph_mds_session *s) { - if (s->s_state == CEPH_MDS_SESSION_CLOSING) { - dout("resending session close request for mds%d\n", - s->s_mds); - request_close_session(s); - return false; - } - if (s->s_ttl && time_after(jiffies, s->s_ttl)) { - if (s->s_state == CEPH_MDS_SESSION_OPEN) { + switch (s->s_state) { + case CEPH_MDS_SESSION_OPEN: + if (s->s_ttl && time_after(jiffies, s->s_ttl)) { s->s_state = CEPH_MDS_SESSION_HUNG; pr_info("mds%d hung\n", s->s_mds); } - } - if (s->s_state == CEPH_MDS_SESSION_NEW || - s->s_state == CEPH_MDS_SESSION_RESTARTING || - s->s_state == CEPH_MDS_SESSION_CLOSED || - s->s_state == CEPH_MDS_SESSION_REJECTED) - /* this mds is failed or recovering, just wait */ + break; + case CEPH_MDS_SESSION_CLOSING: + /* Should never reach this when we're unmounting */ + WARN_ON_ONCE(true); + fallthrough; + case CEPH_MDS_SESSION_NEW: + case CEPH_MDS_SESSION_RESTARTING: + case CEPH_MDS_SESSION_CLOSED: + case CEPH_MDS_SESSION_REJECTED: return false; + } return true; } +/* + * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply, + * then we need to retransmit that request. + */ +void inc_session_sequence(struct ceph_mds_session *s) +{ + lockdep_assert_held(&s->s_mutex); + + s->s_seq++; + + if (s->s_state == CEPH_MDS_SESSION_CLOSING) { + int ret; + + dout("resending session close request for mds%d\n", s->s_mds); + ret = request_close_session(s); + if (ret < 0) + pr_err("unable to close session to mds%d: %d\n", + s->s_mds, ret); + } +} + /* * delayed work -- periodically trim expired leases, renew caps with mds */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index cbf8af437140..f5adbebcb38e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -480,6 +480,7 @@ struct ceph_mds_client { extern const char *ceph_mds_op_name(int op); extern bool check_session_state(struct ceph_mds_session *s); +void inc_session_sequence(struct ceph_mds_session *s); extern struct ceph_mds_session * __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 83cb4f26b689..9b785f11e95a 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -53,7 +53,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, /* increment msg sequence number */ mutex_lock(&session->s_mutex); - session->s_seq++; + inc_session_sequence(session); mutex_unlock(&session->s_mutex); /* lookup inode */ diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 0da39c16dab4..b611f829cb61 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -873,7 +873,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, ceph_snap_op_name(op), split, trace_len); mutex_lock(&session->s_mutex); - session->s_seq++; + inc_session_sequence(session); mutex_unlock(&session->s_mutex); down_write(&mdsc->snap_rwsem); -- cgit v1.2.3 From db0362eeb22992502764e825c79b922d7467e0eb Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Tue, 3 Nov 2020 13:44:25 +0100 Subject: USB: serial: option: add Telit FN980 composition 0x1055 Add the following Telit FN980 composition: 0x1055: tty, adb, tty, tty, tty, tty Signed-off-by: Daniele Palmas Link: https://lore.kernel.org/r/20201103124425.12940-1-dnlplm@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 284d086db7a2..54ca85cc920d 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1191,6 +1191,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1054, 0xff), /* Telit FT980-KS */ .driver_info = NCTRL(2) | RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1055, 0xff), /* Telit FN980 (PCIe) */ + .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), -- cgit v1.2.3 From f78331f74cacb33d87cd60376dacc5bd397959e2 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 3 Nov 2020 10:41:29 +0100 Subject: libbpf: Fix null dereference in xsk_socket__delete Fix a possible null pointer dereference in xsk_socket__delete that will occur if a null pointer is fed into the function. Fixes: 2f6324a3937f ("libbpf: Support shared umems between queues and devices") Reported-by: Andrii Nakryiko Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1604396490-12129-2-git-send-email-magnus.karlsson@gmail.com --- tools/lib/bpf/xsk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index e3c98c007825..504b7a85d445 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -891,13 +891,14 @@ int xsk_umem__delete(struct xsk_umem *umem) void xsk_socket__delete(struct xsk_socket *xsk) { size_t desc_sz = sizeof(struct xdp_desc); - struct xsk_ctx *ctx = xsk->ctx; struct xdp_mmap_offsets off; + struct xsk_ctx *ctx; int err; if (!xsk) return; + ctx = xsk->ctx; if (ctx->prog_fd != -1) { xsk_delete_bpf_maps(xsk); close(ctx->prog_fd); -- cgit v1.2.3 From 25cf73b9ff88fd4608699a0313f820758b4c252d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 3 Nov 2020 10:41:30 +0100 Subject: libbpf: Fix possible use after free in xsk_socket__delete Fix a possible use after free in xsk_socket__delete that will happen if xsk_put_ctx() frees the ctx. To fix, save the umem reference taken from the context and just use that instead. Fixes: 2f6324a3937f ("libbpf: Support shared umems between queues and devices") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1604396490-12129-3-git-send-email-magnus.karlsson@gmail.com --- tools/lib/bpf/xsk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 504b7a85d445..9bc537d0b92d 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -892,6 +892,7 @@ void xsk_socket__delete(struct xsk_socket *xsk) { size_t desc_sz = sizeof(struct xdp_desc); struct xdp_mmap_offsets off; + struct xsk_umem *umem; struct xsk_ctx *ctx; int err; @@ -899,6 +900,7 @@ void xsk_socket__delete(struct xsk_socket *xsk) return; ctx = xsk->ctx; + umem = ctx->umem; if (ctx->prog_fd != -1) { xsk_delete_bpf_maps(xsk); close(ctx->prog_fd); @@ -918,11 +920,11 @@ void xsk_socket__delete(struct xsk_socket *xsk) xsk_put_ctx(ctx); - ctx->umem->refcount--; + umem->refcount--; /* Do not close an fd that also has an associated umem connected * to it. */ - if (xsk->fd != ctx->umem->fd) + if (xsk->fd != umem->fd) close(xsk->fd); free(xsk); } -- cgit v1.2.3 From 9d750c75bd2c3fcf20a3c15378d1bc6b2d4ec31f Mon Sep 17 00:00:00 2001 From: Ryan Kosta Date: Sat, 10 Oct 2020 20:03:51 -0700 Subject: risc-v: kernel: ftrace: Fixes improper SPDX comment style Signed-off-by: Ryan Kosta Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 99e12faa5498..765b62434f30 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Linaro Limited * Author: AKASHI Takahiro -- cgit v1.2.3 From 1344a232016dbb0492be81f8517c4bf8fc1c6610 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 4 Nov 2020 22:17:42 +1100 Subject: powerpc: Use asm_goto_volatile for put_user() Andreas reported that commit ee0a49a6870e ("powerpc/uaccess: Switch __put_user_size_allowed() to __put_user_asm_goto()") broke CLONE_CHILD_SETTID. Further inspection showed that the put_user() in schedule_tail() was missing entirely, the store not emitted by the compiler. <.schedule_tail>: mflr r0 std r0,16(r1) stdu r1,-112(r1) bl <.finish_task_switch> ld r9,2496(r3) cmpdi cr7,r9,0 bne cr7,<.schedule_tail+0x60> ld r3,392(r13) ld r9,1392(r3) cmpdi cr7,r9,0 beq cr7,<.schedule_tail+0x3c> li r4,0 li r5,0 bl <.__task_pid_nr_ns> nop bl <.calculate_sigpending> nop addi r1,r1,112 ld r0,16(r1) mtlr r0 blr nop nop nop bl <.__balance_callback> b <.schedule_tail+0x1c> Notice there are no stores other than to the stack. There should be a stw in there for the store to current->set_child_tid. This is only seen with GCC 4.9 era compilers (tested with 4.9.3 and 4.9.4), and only when CONFIG_PPC_KUAP is disabled. When CONFIG_PPC_KUAP=y, the inline asm that's part of the isync() and mtspr() inlined via allow_user_access() seems to be enough to avoid the bug. We already have a macro to work around this (or a similar bug), called asm_volatile_goto which includes an empty asm block to tickle the compiler into generating the right code. So use that. With this applied the code generation looks more like it will work: <.schedule_tail>: mflr r0 std r31,-8(r1) std r0,16(r1) stdu r1,-144(r1) std r3,112(r1) bl <._mcount> nop ld r3,112(r1) bl <.finish_task_switch> ld r9,2624(r3) cmpdi cr7,r9,0 bne cr7,<.schedule_tail+0xa0> ld r3,2408(r13) ld r31,1856(r3) cmpdi cr7,r31,0 beq cr7,<.schedule_tail+0x80> li r4,0 li r5,0 bl <.__task_pid_nr_ns> nop li r9,-1 clrldi r9,r9,12 cmpld cr7,r31,r9 bgt cr7,<.schedule_tail+0x80> lis r9,16 rldicr r9,r9,32,31 subf r9,r31,r9 cmpldi cr7,r9,3 ble cr7,<.schedule_tail+0x80> li r9,0 stw r3,0(r31) <-- stw nop bl <.calculate_sigpending> nop addi r1,r1,144 ld r0,16(r1) ld r31,-8(r1) mtlr r0 blr nop bl <.__balance_callback> b <.schedule_tail+0x30> Fixes: ee0a49a6870e ("powerpc/uaccess: Switch __put_user_size_allowed() to __put_user_asm_goto()") Reported-by: Andreas Schwab Tested-by: Andreas Schwab Suggested-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201104111742.672142-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index ef5bbb705c08..501c9a79038c 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -178,7 +178,7 @@ do { \ * are no aliasing issues. */ #define __put_user_asm_goto(x, addr, label, op) \ - asm volatile goto( \ + asm_volatile_goto( \ "1: " op "%U1%X1 %0,%1 # put_user\n" \ EX_TABLE(1b, %l2) \ : \ @@ -191,7 +191,7 @@ do { \ __put_user_asm_goto(x, ptr, label, "std") #else /* __powerpc64__ */ #define __put_user_asm2_goto(x, addr, label) \ - asm volatile goto( \ + asm_volatile_goto( \ "1: stw%X1 %0, %1\n" \ "2: stw%X1 %L0, %L1\n" \ EX_TABLE(1b, %l2) \ -- cgit v1.2.3 From 46afb0628b86347933b16ac966655f74eab65c8c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Nov 2020 17:14:07 -0800 Subject: xfs: only flush the unshared range in xfs_reflink_unshare There's no reason to flush an entire file when we're unsharing part of a file. Therefore, only initiate writeback on the selected range. Signed-off-by: Darrick J. Wong Reviewed-by: Chandan Babu R --- fs/xfs/xfs_reflink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 16098dc42add..6fa05fb78189 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1502,7 +1502,8 @@ xfs_reflink_unshare( &xfs_buffered_write_iomap_ops); if (error) goto out; - error = filemap_write_and_wait(inode->i_mapping); + + error = filemap_write_and_wait_range(inode->i_mapping, offset, len); if (error) goto out; -- cgit v1.2.3 From a1fbc6750e212c5675a4e48d7f51d44607eb8756 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 4 Oct 2020 19:04:26 +0100 Subject: btrfs: fix potential overflow in cluster_pages_for_defrag on 32bit arch On 32-bit systems, this shift will overflow for files larger than 4GB as start_index is unsigned long while the calls to btrfs_delalloc_*_space expect u64. CC: stable@vger.kernel.org # 4.4+ Fixes: df480633b891 ("btrfs: extent-tree: Switch to new delalloc space reserve and release") Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Sterba [ define the variable instead of repeating the shift ] Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ab408a23ba32..69a384145dc6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1274,6 +1274,7 @@ static int cluster_pages_for_defrag(struct inode *inode, u64 page_start; u64 page_end; u64 page_cnt; + u64 start = (u64)start_index << PAGE_SHIFT; int ret; int i; int i_done; @@ -1290,8 +1291,7 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - start_index << PAGE_SHIFT, - page_cnt << PAGE_SHIFT); + start, page_cnt << PAGE_SHIFT); if (ret) return ret; i_done = 0; @@ -1380,8 +1380,7 @@ again: btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start_index << PAGE_SHIFT, - (page_cnt - i_done) << PAGE_SHIFT, true); + start, (page_cnt - i_done) << PAGE_SHIFT, true); } @@ -1408,8 +1407,7 @@ out: put_page(pages[i]); } btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start_index << PAGE_SHIFT, - page_cnt << PAGE_SHIFT, true); + start, page_cnt << PAGE_SHIFT, true); btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return ret; -- cgit v1.2.3 From e38fdb716702879a942017c85e84c0a3a9e4af96 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Oct 2020 16:57:26 -0400 Subject: btrfs: print the block rsv type when we fail our reservation To help with debugging, print the type of the block rsv when we fail to use our target block rsv in btrfs_use_block_rsv. This now produces: [ 544.672035] BTRFS: block rsv 1 returned -28 which is still cryptic without consulting the enum in block-rsv.h but I guess it's better than nothing. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ add note from Nikolay ] Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 7e1549a84fcc..bc920afe23bf 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -511,7 +511,8 @@ again: /*DEFAULT_RATELIMIT_BURST*/ 1); if (__ratelimit(&_rs)) WARN(1, KERN_DEBUG - "BTRFS: block rsv returned %d\n", ret); + "BTRFS: block rsv %d returned %d\n", + block_rsv->type, ret); } try_reserve: ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize, -- cgit v1.2.3 From fca3a45d08782a2bb85e048fb8e3128b1388d7b7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Oct 2020 16:57:27 -0400 Subject: btrfs: fix min reserved size calculation in merge_reloc_root The minimum reserve size was adjusted to take into account the height of the tree we are merging, however we can have a root with a level == 0. What we want is root_level + 1 to get the number of nodes we may have to cow. This fixes the enospc_debug warning pops with btrfs/101. Nikolay: this fixes failures on btrfs/060 btrfs/062 btrfs/063 and btrfs/195 That I was seeing, the call trace was: [ 3680.515564] ------------[ cut here ]------------ [ 3680.515566] BTRFS: block rsv returned -28 [ 3680.515585] WARNING: CPU: 2 PID: 8339 at fs/btrfs/block-rsv.c:521 btrfs_use_block_rsv+0x162/0x180 [ 3680.515587] Modules linked in: [ 3680.515591] CPU: 2 PID: 8339 Comm: btrfs Tainted: G W 5.9.0-rc8-default #95 [ 3680.515593] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 [ 3680.515595] RIP: 0010:btrfs_use_block_rsv+0x162/0x180 [ 3680.515600] RSP: 0018:ffffa01ac9753910 EFLAGS: 00010282 [ 3680.515602] RAX: 0000000000000000 RBX: ffff984b34200000 RCX: 0000000000000027 [ 3680.515604] RDX: 0000000000000027 RSI: 0000000000000000 RDI: ffff984b3bd19e28 [ 3680.515606] RBP: 0000000000004000 R08: ffff984b3bd19e20 R09: 0000000000000001 [ 3680.515608] R10: 0000000000000004 R11: 0000000000000046 R12: ffff984b264fdc00 [ 3680.515609] R13: ffff984b13149000 R14: 00000000ffffffe4 R15: ffff984b34200000 [ 3680.515613] FS: 00007f4e2912b8c0(0000) GS:ffff984b3bd00000(0000) knlGS:0000000000000000 [ 3680.515615] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3680.515617] CR2: 00007fab87122150 CR3: 0000000118e42000 CR4: 00000000000006e0 [ 3680.515620] Call Trace: [ 3680.515627] btrfs_alloc_tree_block+0x8b/0x340 [ 3680.515633] ? __lock_acquire+0x51a/0xac0 [ 3680.515646] alloc_tree_block_no_bg_flush+0x4f/0x60 [ 3680.515651] __btrfs_cow_block+0x14e/0x7e0 [ 3680.515662] btrfs_cow_block+0x144/0x2c0 [ 3680.515670] merge_reloc_root+0x4d4/0x610 [ 3680.515675] ? btrfs_lookup_fs_root+0x78/0x90 [ 3680.515686] merge_reloc_roots+0xee/0x280 [ 3680.515695] relocate_block_group+0x2ce/0x5e0 [ 3680.515704] btrfs_relocate_block_group+0x16e/0x310 [ 3680.515711] btrfs_relocate_chunk+0x38/0xf0 [ 3680.515716] btrfs_shrink_device+0x200/0x560 [ 3680.515728] btrfs_rm_device+0x1ae/0x6a6 [ 3680.515744] ? _copy_from_user+0x6e/0xb0 [ 3680.515750] btrfs_ioctl+0x1afe/0x28c0 [ 3680.515755] ? find_held_lock+0x2b/0x80 [ 3680.515760] ? do_user_addr_fault+0x1f8/0x418 [ 3680.515773] ? __x64_sys_ioctl+0x77/0xb0 [ 3680.515775] __x64_sys_ioctl+0x77/0xb0 [ 3680.515781] do_syscall_64+0x31/0x70 [ 3680.515785] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported-by: Nikolay Borisov Fixes: 44d354abf33e ("btrfs: relocation: review the call sites which can be interrupted by signal") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3602806d71bd..9ba92d86da0b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1648,6 +1648,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, struct btrfs_root_item *root_item; struct btrfs_path *path; struct extent_buffer *leaf; + int reserve_level; int level; int max_level; int replaced = 0; @@ -1696,7 +1697,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, * Thus the needed metadata size is at most root_level * nodesize, * and * 2 since we have two trees to COW. */ - min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2; + reserve_level = max_t(int, 1, btrfs_root_level(root_item)); + min_reserved = fs_info->nodesize * reserve_level * 2; memset(&next_key, 0, sizeof(next_key)); while (1) { -- cgit v1.2.3 From f07728d541ebefcf3d2ec7bc99a3bffd052d9f90 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 23 Oct 2020 14:26:33 +0300 Subject: btrfs: clean up NULL checks in qgroup_unreserve_range() Smatch complains that this code dereferences "entry" before checking whether it's NULL on the next line. Fortunately, rb_entry() will never return NULL so it doesn't cause a problem. We can clean up the NULL checking a bit to silence the warning and make the code more clear. Reviewed-by: Qu Wenruo Signed-off-by: Dan Carpenter Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c54ea6586632..77c54749f432 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3435,24 +3435,20 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, { struct rb_node *node; struct rb_node *next; - struct ulist_node *entry = NULL; + struct ulist_node *entry; int ret = 0; node = reserved->range_changed.root.rb_node; + if (!node) + return 0; while (node) { entry = rb_entry(node, struct ulist_node, rb_node); if (entry->val < start) node = node->rb_right; - else if (entry) - node = node->rb_left; else - break; + node = node->rb_left; } - /* Empty changeset */ - if (!entry) - return 0; - if (entry->val > start && rb_prev(&entry->rb_node)) entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, rb_node); -- cgit v1.2.3 From a4852cf268b5ae487ba18f2b24e44094afce0675 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jul 2020 11:25:40 +0200 Subject: btrfs: scrub: update message regarding read-only status Based on user feedback update the message printed when scrub fails to start due to write requirements. To make a distinction add a device id to the messages. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cf63f1e27a27..e71e7586e9eb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3866,8 +3866,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (!is_dev_replace && !readonly && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", - rcu_str_deref(dev->name)); + btrfs_err_in_rcu(fs_info, + "scrub on devid %llu: filesystem on %s is not writable", + devid, rcu_str_deref(dev->name)); ret = -EROFS; goto out; } -- cgit v1.2.3 From cf89af146b7e62af55470cf5f3ec3c56ec144a5e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 30 Oct 2020 06:53:56 +0800 Subject: btrfs: dev-replace: fail mount if we don't have replace item with target device If there is a device BTRFS_DEV_REPLACE_DEVID without the device replace item, then it means the filesystem is inconsistent state. This is either corruption or a crafted image. Fail the mount as this needs a closer look what is actually wrong. As of now if BTRFS_DEV_REPLACE_DEVID is present without the replace item, in __btrfs_free_extra_devids() we determine that there is an extra device, and free those extra devices but continue to mount the device. However, we were wrong in keeping tack of the rw_devices so the syzbot testcase failed: WARNING: CPU: 1 PID: 3612 at fs/btrfs/volumes.c:1166 close_fs_devices.part.0+0x607/0x800 fs/btrfs/volumes.c:1166 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 3612 Comm: syz-executor.2 Not tainted 5.9.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x198/0x1fd lib/dump_stack.c:118 panic+0x347/0x7c0 kernel/panic.c:231 __warn.cold+0x20/0x46 kernel/panic.c:600 report_bug+0x1bd/0x210 lib/bug.c:198 handle_bug+0x38/0x90 arch/x86/kernel/traps.c:234 exc_invalid_op+0x14/0x40 arch/x86/kernel/traps.c:254 asm_exc_invalid_op+0x12/0x20 arch/x86/include/asm/idtentry.h:536 RIP: 0010:close_fs_devices.part.0+0x607/0x800 fs/btrfs/volumes.c:1166 RSP: 0018:ffffc900091777e0 EFLAGS: 00010246 RAX: 0000000000040000 RBX: ffffffffffffffff RCX: ffffc9000c8b7000 RDX: 0000000000040000 RSI: ffffffff83097f47 RDI: 0000000000000007 RBP: dffffc0000000000 R08: 0000000000000001 R09: ffff8880988a187f R10: 0000000000000000 R11: 0000000000000001 R12: ffff88809593a130 R13: ffff88809593a1ec R14: ffff8880988a1908 R15: ffff88809593a050 close_fs_devices fs/btrfs/volumes.c:1193 [inline] btrfs_close_devices+0x95/0x1f0 fs/btrfs/volumes.c:1179 open_ctree+0x4984/0x4a2d fs/btrfs/disk-io.c:3434 btrfs_fill_super fs/btrfs/super.c:1316 [inline] btrfs_mount_root.cold+0x14/0x165 fs/btrfs/super.c:1672 The fix here is, when we determine that there isn't a replace item then fail the mount if there is a replace target device (devid 0). CC: stable@vger.kernel.org # 4.19+ Reported-by: syzbot+4cfe71a4da060be47502@syzkaller.appspotmail.com Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 26 ++++++++++++++++++++++++-- fs/btrfs/volumes.c | 26 +++++++------------------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5b9e3f3ace22..10638537b9ef 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -91,6 +91,17 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); if (ret) { no_valid_dev_replace_entry_found: + /* + * We don't have a replace item or it's corrupted. If there is + * a replace target, fail the mount. + */ + if (btrfs_find_device(fs_info->fs_devices, + BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + btrfs_err(fs_info, + "found replace target device without a valid replace item"); + ret = -EUCLEAN; + goto out; + } ret = 0; dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; @@ -143,8 +154,19 @@ no_valid_dev_replace_entry_found: case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - dev_replace->srcdev = NULL; - dev_replace->tgtdev = NULL; + /* + * We don't have an active replace item but if there is a + * replace target, fail the mount. + */ + if (btrfs_find_device(fs_info->fs_devices, + BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + btrfs_err(fs_info, + "replace devid present without an active replace item"); + ret = -EUCLEAN; + } else { + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + } break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b1e48078c318..a6406b3b8c2b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1056,22 +1056,13 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, continue; } - if (device->devid == BTRFS_DEV_REPLACE_DEVID) { - /* - * In the first step, keep the device which has - * the correct fsid and the devid that is used - * for the dev_replace procedure. - * In the second step, the dev_replace state is - * read from the device tree and it is known - * whether the procedure is really active or - * not, which means whether this device is - * used or whether it should be removed. - */ - if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state)) { - continue; - } - } + /* + * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, + * in btrfs_init_dev_replace() so just continue. + */ + if (device->devid == BTRFS_DEV_REPLACE_DEVID) + continue; + if (device->bdev) { blkdev_put(device->bdev, device->mode); device->bdev = NULL; @@ -1080,9 +1071,6 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { list_del_init(&device->dev_alloc_list); clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state)) - fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; -- cgit v1.2.3 From 468600c6ec28613b756193c5f780aac062f1acdf Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Wed, 21 Oct 2020 13:36:55 +0800 Subject: btrfs: ref-verify: fix memory leak in btrfs_ref_tree_mod There is one error handling path that does not free ref, which may cause a minor memory leak. CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Josef Bacik Signed-off-by: Dinghao Liu Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 7f03dbe5b609..78693d3dd15b 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -860,6 +860,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "dropping a ref for a root that doesn't have a ref on the block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); kfree(ra); goto out_unlock; } -- cgit v1.2.3 From 11522448e641e8f1690c9db06e01985e8e19b401 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 10 Oct 2020 15:14:30 +0000 Subject: powerpc/603: Always fault when _PAGE_ACCESSED is not set The kernel expects pte_young() to work regardless of CONFIG_SWAP. Make sure a minor fault is taken to set _PAGE_ACCESSED when it is not already set, regardless of the selection of CONFIG_SWAP. Fixes: 84de6ab0e904 ("powerpc/603: don't handle PAGE_ACCESSED in TLB miss handlers.") Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a44367744de54e2315b2f1a8cbbd7f88488072e0.1602342806.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_book3s_32.S | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index 5eb9eedac920..2aa16d5368e1 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -457,11 +457,7 @@ InstructionTLBMiss: cmplw 0,r1,r3 #endif mfspr r2, SPRN_SPRG_PGDIR -#ifdef CONFIG_SWAP li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC -#else - li r1,_PAGE_PRESENT | _PAGE_EXEC -#endif #if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ @@ -523,11 +519,7 @@ DataLoadTLBMiss: lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 mfspr r2, SPRN_SPRG_PGDIR -#ifdef CONFIG_SWAP li r1, _PAGE_PRESENT | _PAGE_ACCESSED -#else - li r1, _PAGE_PRESENT -#endif bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ @@ -603,11 +595,7 @@ DataStoreTLBMiss: lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 mfspr r2, SPRN_SPRG_PGDIR -#ifdef CONFIG_SWAP li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED -#else - li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT -#endif bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ -- cgit v1.2.3 From 0540b0d2ce9073fd2a736d636218faa61c99e572 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 10 Oct 2020 15:14:29 +0000 Subject: powerpc/40x: Always fault when _PAGE_ACCESSED is not set The kernel expects pte_young() to work regardless of CONFIG_SWAP. Make sure a minor fault is taken to set _PAGE_ACCESSED when it is not already set, regardless of the selection of CONFIG_SWAP. Fixes: 2c74e2586bb9 ("powerpc/40x: Rework 40x PTE access and TLB miss") Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b02ca2ed2d3676a096219b48c0f69ec982a75bcf.1602342801.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_40x.S | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 44c9018aed1b..a1ae00689e0f 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -284,11 +284,7 @@ _ENTRY(saved_ksp_limit) rlwimi r11, r10, 22, 20, 29 /* Compute PTE address */ lwz r11, 0(r11) /* Get Linux PTE */ -#ifdef CONFIG_SWAP li r9, _PAGE_PRESENT | _PAGE_ACCESSED -#else - li r9, _PAGE_PRESENT -#endif andc. r9, r9, r11 /* Check permission */ bne 5f @@ -369,11 +365,7 @@ _ENTRY(saved_ksp_limit) rlwimi r11, r10, 22, 20, 29 /* Compute PTE address */ lwz r11, 0(r11) /* Get Linux PTE */ -#ifdef CONFIG_SWAP li r9, _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC -#else - li r9, _PAGE_PRESENT | _PAGE_EXEC -#endif andc. r9, r9, r11 /* Check permission */ bne 5f -- cgit v1.2.3 From 29daf869cbab69088fe1755d9dd224e99ba78b56 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 12 Oct 2020 08:54:31 +0000 Subject: powerpc/8xx: Always fault when _PAGE_ACCESSED is not set The kernel expects pte_young() to work regardless of CONFIG_SWAP. Make sure a minor fault is taken to set _PAGE_ACCESSED when it is not already set, regardless of the selection of CONFIG_SWAP. This adds at least 3 instructions to the TLB miss exception handlers fast path. Following patch will reduce this overhead. Also update the rotation instruction to the correct number of bits to reflect all changes done to _PAGE_ACCESSED over time. Fixes: d069cb4373fe ("powerpc/8xx: Don't touch ACCESSED when no SWAP.") Fixes: 5f356497c384 ("powerpc/8xx: remove unused _PAGE_WRITETHRU") Fixes: e0a8e0d90a9f ("powerpc/8xx: Handle PAGE_USER via APG bits") Fixes: 5b2753fc3e8a ("powerpc/8xx: Implementation of PAGE_EXEC") Fixes: a891c43b97d3 ("powerpc/8xx: Prepare handlers for _PAGE_HUGE for 512k pages.") Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/af834e8a0f1fa97bfae65664950f0984a70c4750.1602492856.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_8xx.S | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 9f359d3fba74..6f3799a04121 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -202,9 +202,7 @@ SystemCall: InstructionTLBMiss: mtspr SPRN_SPRG_SCRATCH0, r10 -#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_SWAP) || defined(CONFIG_HUGETLBFS) mtspr SPRN_SPRG_SCRATCH1, r11 -#endif /* If we are faulting a kernel address, we have to use the * kernel page tables. @@ -238,11 +236,9 @@ InstructionTLBMiss: rlwimi r11, r10, 32 - 9, _PMD_PAGE_512K mtspr SPRN_MI_TWC, r11 #endif -#ifdef CONFIG_SWAP - rlwinm r11, r10, 32-5, _PAGE_PRESENT + rlwinm r11, r10, 32-7, _PAGE_PRESENT and r11, r11, r10 rlwimi r10, r11, 0, _PAGE_PRESENT -#endif /* The Linux PTE won't go exactly into the MMU TLB. * Software indicator bits 20 and 23 must be clear. * Software indicator bits 22, 24, 25, 26, and 27 must be @@ -256,9 +252,7 @@ InstructionTLBMiss: /* Restore registers */ 0: mfspr r10, SPRN_SPRG_SCRATCH0 -#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_SWAP) || defined(CONFIG_HUGETLBFS) mfspr r11, SPRN_SPRG_SCRATCH1 -#endif rfi patch_site 0b, patch__itlbmiss_exit_1 @@ -268,9 +262,7 @@ InstructionTLBMiss: addi r10, r10, 1 stw r10, (itlb_miss_counter - PAGE_OFFSET)@l(0) mfspr r10, SPRN_SPRG_SCRATCH0 -#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_SWAP) mfspr r11, SPRN_SPRG_SCRATCH1 -#endif rfi #endif @@ -316,11 +308,9 @@ DataStoreTLBMiss: * r11 = ((r10 & PRESENT) & ((r10 & ACCESSED) >> 5)); * r10 = (r10 & ~PRESENT) | r11; */ -#ifdef CONFIG_SWAP - rlwinm r11, r10, 32-5, _PAGE_PRESENT + rlwinm r11, r10, 32-7, _PAGE_PRESENT and r11, r11, r10 rlwimi r10, r11, 0, _PAGE_PRESENT -#endif /* The Linux PTE won't go exactly into the MMU TLB. * Software indicator bits 24, 25, 26, and 27 must be * set. All other Linux PTE bits control the behavior -- cgit v1.2.3 From 33fe43cfd9b1c20f6f9899b44bf04e91823ff1c9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 12 Oct 2020 08:54:33 +0000 Subject: powerpc/8xx: Manage _PAGE_ACCESSED through APG bits in L1 entry When _PAGE_ACCESSED is not set, a minor fault is expected. To do this, TLB miss exception ANDs _PAGE_PRESENT and _PAGE_ACCESSED into the L2 entry valid bit. To simplify the processing and reduce the number of instructions in TLB miss exceptions, manage it as an APG bit and get it next to _PAGE_GUARDED bit to allow a copy in one go. Then declare the corresponding groups as handling all accesses as user accesses. As the PP bits always define user as No Access, it will generate a fault. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/80f488db230c6b0e7b3b990d72bd94a8a069e93e.1602492856.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/kup-8xx.h | 2 +- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 47 +++++++++------------------- arch/powerpc/include/asm/nohash/32/pte-8xx.h | 9 +++--- arch/powerpc/kernel/head_8xx.S | 36 +++++---------------- 4 files changed, 28 insertions(+), 66 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h index 85ed2390fb99..567cdc557402 100644 --- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -63,7 +63,7 @@ static inline void restore_user_access(unsigned long flags) static inline bool bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { - return WARN(!((regs->kuap ^ MD_APG_KUAP) & 0xf0000000), + return WARN(!((regs->kuap ^ MD_APG_KUAP) & 0xff000000), "Bug: fault blocked by AP register !"); } diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 1d9ac0f9c794..0bd1b144eb76 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -33,19 +33,18 @@ * respectively NA for All or X for Supervisor and no access for User. * Then we use the APG to say whether accesses are according to Page rules or * "all Supervisor" rules (Access to all) - * Therefore, we define 2 APG groups. lsb is _PMD_USER - * 0 => Kernel => 01 (all accesses performed according to page definition) - * 1 => User => 00 (all accesses performed as supervisor iaw page definition) - * 2-15 => Not Used - */ -#define MI_APG_INIT 0x40000000 - -/* - * 0 => Kernel => 01 (all accesses performed according to page definition) - * 1 => User => 10 (all accesses performed according to swaped page definition) - * 2-15 => Not Used - */ -#define MI_APG_KUEP 0x60000000 + * _PAGE_ACCESSED is also managed via APG. When _PAGE_ACCESSED is not set, say + * "all User" rules, that will lead to NA for all. + * Therefore, we define 4 APG groups. lsb is _PAGE_ACCESSED + * 0 => Kernel => 11 (all accesses performed according as user iaw page definition) + * 1 => Kernel+Accessed => 01 (all accesses performed according to page definition) + * 2 => User => 11 (all accesses performed according as user iaw page definition) + * 3 => User+Accessed => 00 (all accesses performed as supervisor iaw page definition) for INIT + * => 10 (all accesses performed according to swaped page definition) for KUEP + * 4-15 => Not Used + */ +#define MI_APG_INIT 0xdc000000 +#define MI_APG_KUEP 0xde000000 /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MI_RPN is written, bits in @@ -106,25 +105,9 @@ #define MD_Ks 0x80000000 /* Should not be set */ #define MD_Kp 0x40000000 /* Should always be set */ -/* - * All pages' PP data bits are set to either 000 or 011 or 001, which means - * respectively RW for Supervisor and no access for User, or RO for - * Supervisor and no access for user and NA for ALL. - * Then we use the APG to say whether accesses are according to Page rules or - * "all Supervisor" rules (Access to all) - * Therefore, we define 2 APG groups. lsb is _PMD_USER - * 0 => Kernel => 01 (all accesses performed according to page definition) - * 1 => User => 00 (all accesses performed as supervisor iaw page definition) - * 2-15 => Not Used - */ -#define MD_APG_INIT 0x40000000 - -/* - * 0 => No user => 01 (all accesses performed according to page definition) - * 1 => User => 10 (all accesses performed according to swaped page definition) - * 2-15 => Not Used - */ -#define MD_APG_KUAP 0x60000000 +/* See explanation above at the definition of MI_APG_INIT */ +#define MD_APG_INIT 0xdc000000 +#define MD_APG_KUAP 0xde000000 /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MD_RPN is written, bits in diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index 66f403a7da44..1581204467e1 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -39,9 +39,9 @@ * into the TLB. */ #define _PAGE_GUARDED 0x0010 /* Copied to L1 G entry in DTLB */ -#define _PAGE_SPECIAL 0x0020 /* SW entry */ +#define _PAGE_ACCESSED 0x0020 /* Copied to L1 APG 1 entry in I/DTLB */ #define _PAGE_EXEC 0x0040 /* Copied to PP (bit 21) in ITLB */ -#define _PAGE_ACCESSED 0x0080 /* software: page referenced */ +#define _PAGE_SPECIAL 0x0080 /* SW entry */ #define _PAGE_NA 0x0200 /* Supervisor NA, User no access */ #define _PAGE_RO 0x0600 /* Supervisor RO, User no access */ @@ -59,11 +59,12 @@ #define _PMD_PRESENT 0x0001 #define _PMD_PRESENT_MASK _PMD_PRESENT -#define _PMD_BAD 0x0fd0 +#define _PMD_BAD 0x0f90 #define _PMD_PAGE_MASK 0x000c #define _PMD_PAGE_8M 0x000c #define _PMD_PAGE_512K 0x0004 -#define _PMD_USER 0x0020 /* APG 1 */ +#define _PMD_ACCESSED 0x0020 /* APG 1 */ +#define _PMD_USER 0x0040 /* APG 2 */ #define _PTE_NONE_MASK 0 diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 6f3799a04121..ee0bfebc375f 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -222,23 +222,13 @@ InstructionTLBMiss: 3: mtcr r11 #endif -#if defined(CONFIG_HUGETLBFS) || !defined(CONFIG_PIN_TLB_TEXT) lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r10) /* Get level 1 entry */ mtspr SPRN_MD_TWC, r11 -#else - lwz r10, (swapper_pg_dir-PAGE_OFFSET)@l(r10) /* Get level 1 entry */ - mtspr SPRN_MI_TWC, r10 /* Set segment attributes */ - mtspr SPRN_MD_TWC, r10 -#endif mfspr r10, SPRN_MD_TWC lwz r10, 0(r10) /* Get the pte */ -#if defined(CONFIG_HUGETLBFS) || !defined(CONFIG_PIN_TLB_TEXT) + rlwimi r11, r10, 0, _PAGE_GUARDED | _PAGE_ACCESSED rlwimi r11, r10, 32 - 9, _PMD_PAGE_512K mtspr SPRN_MI_TWC, r11 -#endif - rlwinm r11, r10, 32-7, _PAGE_PRESENT - and r11, r11, r10 - rlwimi r10, r11, 0, _PAGE_PRESENT /* The Linux PTE won't go exactly into the MMU TLB. * Software indicator bits 20 and 23 must be clear. * Software indicator bits 22, 24, 25, 26, and 27 must be @@ -289,28 +279,16 @@ DataStoreTLBMiss: mfspr r10, SPRN_MD_TWC lwz r10, 0(r10) /* Get the pte */ - /* Insert the Guarded flag into the TWC from the Linux PTE. + /* Insert Guarded and Accessed flags into the TWC from the Linux PTE. * It is bit 27 of both the Linux PTE and the TWC (at least * I got that right :-). It will be better when we can put * this into the Linux pgd/pmd and load it in the operation * above. */ - rlwimi r11, r10, 0, _PAGE_GUARDED + rlwimi r11, r10, 0, _PAGE_GUARDED | _PAGE_ACCESSED rlwimi r11, r10, 32 - 9, _PMD_PAGE_512K mtspr SPRN_MD_TWC, r11 - /* Both _PAGE_ACCESSED and _PAGE_PRESENT has to be set. - * We also need to know if the insn is a load/store, so: - * Clear _PAGE_PRESENT and load that which will - * trap into DTLB Error with store bit set accordinly. - */ - /* PRESENT=0x1, ACCESSED=0x20 - * r11 = ((r10 & PRESENT) & ((r10 & ACCESSED) >> 5)); - * r10 = (r10 & ~PRESENT) | r11; - */ - rlwinm r11, r10, 32-7, _PAGE_PRESENT - and r11, r11, r10 - rlwimi r10, r11, 0, _PAGE_PRESENT /* The Linux PTE won't go exactly into the MMU TLB. * Software indicator bits 24, 25, 26, and 27 must be * set. All other Linux PTE bits control the behavior @@ -701,7 +679,7 @@ initial_mmu: li r9, 4 /* up to 4 pages of 8M */ mtctr r9 lis r9, KERNELBASE@h /* Create vaddr for TLB */ - li r10, MI_PS8MEG | MI_SVALID /* Set 8M byte page */ + li r10, MI_PS8MEG | _PMD_ACCESSED | MI_SVALID li r11, MI_BOOTINIT /* Create RPN for address 0 */ 1: mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */ @@ -765,7 +743,7 @@ _GLOBAL(mmu_pin_tlb) #ifdef CONFIG_PIN_TLB_TEXT LOAD_REG_IMMEDIATE(r5, 28 << 8) LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET) - LOAD_REG_IMMEDIATE(r7, MI_SVALID | MI_PS8MEG) + LOAD_REG_IMMEDIATE(r7, MI_SVALID | MI_PS8MEG | _PMD_ACCESSED) LOAD_REG_IMMEDIATE(r8, 0xf0 | _PAGE_RO | _PAGE_SPS | _PAGE_SH | _PAGE_PRESENT) LOAD_REG_ADDR(r9, _sinittext) li r0, 4 @@ -787,7 +765,7 @@ _GLOBAL(mmu_pin_tlb) LOAD_REG_IMMEDIATE(r5, 28 << 8 | MD_TWAM) #ifdef CONFIG_PIN_TLB_DATA LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET) - LOAD_REG_IMMEDIATE(r7, MI_SVALID | MI_PS8MEG) + LOAD_REG_IMMEDIATE(r7, MI_SVALID | MI_PS8MEG | _PMD_ACCESSED) #ifdef CONFIG_PIN_TLB_IMMR li r0, 3 #else @@ -824,7 +802,7 @@ _GLOBAL(mmu_pin_tlb) #endif #ifdef CONFIG_PIN_TLB_IMMR LOAD_REG_IMMEDIATE(r0, VIRT_IMMR_BASE | MD_EVALID) - LOAD_REG_IMMEDIATE(r7, MD_SVALID | MD_PS512K | MD_GUARDED) + LOAD_REG_IMMEDIATE(r7, MD_SVALID | MD_PS512K | MD_GUARDED | _PMD_ACCESSED) mfspr r8, SPRN_IMMR rlwinm r8, r8, 0, 0xfff80000 ori r8, r8, 0xf0 | _PAGE_DIRTY | _PAGE_SPS | _PAGE_SH | \ -- cgit v1.2.3 From ef9865a442286e2737f37f56eb54c12ef8465905 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Nov 2020 14:06:19 +0000 Subject: io_uring: don't forget to task-cancel drained reqs If there is a long-standing request of one task locking up execution of deferred requests, and the defer list contains requests of another task (all files-less), then a potential execution of __io_uring_task_cancel() by that another task will sleep until that first long-standing request completion, and that may take long. E.g. tsk1: req1/read(empty_pipe) -> tsk2: req(DRAIN) Then __io_uring_task_cancel(tsk2) waits for req1 completion. It seems we even can manufacture a complicated case with many tasks sharing many rings that can lock them forever. Cancel deferred requests for __io_uring_task_cancel() as well. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d6f7f8b3837f..3d489cf31926 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8496,6 +8496,7 @@ static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) } static void io_cancel_defer_files(struct io_ring_ctx *ctx, + struct task_struct *task, struct files_struct *files) { struct io_defer_entry *de = NULL; @@ -8503,7 +8504,8 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_files(de->req, files)) { + if (io_task_match(de->req, task) && + io_match_files(de->req, files)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } @@ -8529,7 +8531,6 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx, if (list_empty_careful(&ctx->inflight_list)) return false; - io_cancel_defer_files(ctx, files); /* cancel all at once, should be faster than doing it one by one*/ io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); @@ -8621,6 +8622,11 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_sq_thread_park(ctx->sq_data); } + if (files) + io_cancel_defer_files(ctx, NULL, files); + else + io_cancel_defer_files(ctx, task, NULL); + io_cqring_overflow_flush(ctx, true, task, files); while (__io_uring_cancel_task_requests(ctx, task, files)) { -- cgit v1.2.3 From 6b47ab81c9a9b56a94882815e9949d40e4207c92 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Nov 2020 09:50:16 -0700 Subject: io_uring: use correct pointer for io_uring_show_cred() Previous commit changed how we index the registered credentials, but neglected to update one spot that is used when the personalities are iterated through ->show_fdinfo(). Ensure we use the right struct type for the iteration. Reported-by: syzbot+a6d494688cdb797bdfce@syzkaller.appspotmail.com Fixes: 1e6fa5216a0e ("io_uring: COW io_identity on mismatch") Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3d489cf31926..29f1417690d5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8974,7 +8974,8 @@ out_fput: #ifdef CONFIG_PROC_FS static int io_uring_show_cred(int id, void *p, void *data) { - const struct cred *cred = p; + struct io_identity *iod = p; + const struct cred *cred = iod->creds; struct seq_file *m = data; struct user_namespace *uns = seq_user_ns(m); struct group_info *gi; -- cgit v1.2.3 From f3ae6c6e8a3ea49076d826c64e63ea78fbf9db43 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 8 Oct 2020 15:26:30 +0300 Subject: selftests: proc: fix warning: _GNU_SOURCE redefined Makefile already contains -D_GNU_SOURCE, so we can remove it from the *.c files. Signed-off-by: Tommi Rantala Signed-off-by: Shuah Khan --- tools/testing/selftests/proc/proc-loadavg-001.c | 1 - tools/testing/selftests/proc/proc-self-syscall.c | 1 - tools/testing/selftests/proc/proc-uptime-002.c | 1 - 3 files changed, 3 deletions(-) diff --git a/tools/testing/selftests/proc/proc-loadavg-001.c b/tools/testing/selftests/proc/proc-loadavg-001.c index 471e2aa28077..fb4fe9188806 100644 --- a/tools/testing/selftests/proc/proc-loadavg-001.c +++ b/tools/testing/selftests/proc/proc-loadavg-001.c @@ -14,7 +14,6 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* Test that /proc/loadavg correctly reports last pid in pid namespace. */ -#define _GNU_SOURCE #include #include #include diff --git a/tools/testing/selftests/proc/proc-self-syscall.c b/tools/testing/selftests/proc/proc-self-syscall.c index 9f6d000c0245..8511dcfe67c7 100644 --- a/tools/testing/selftests/proc/proc-self-syscall.c +++ b/tools/testing/selftests/proc/proc-self-syscall.c @@ -13,7 +13,6 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#define _GNU_SOURCE #include #include #include diff --git a/tools/testing/selftests/proc/proc-uptime-002.c b/tools/testing/selftests/proc/proc-uptime-002.c index 30e2b7849089..e7ceabed7f51 100644 --- a/tools/testing/selftests/proc/proc-uptime-002.c +++ b/tools/testing/selftests/proc/proc-uptime-002.c @@ -15,7 +15,6 @@ */ // Test that values in /proc/uptime increment monotonically // while shifting across CPUs. -#define _GNU_SOURCE #undef NDEBUG #include #include -- cgit v1.2.3 From 1d44d0dd61b6121b49f25b731f2f7f605cb3c896 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 8 Oct 2020 15:26:31 +0300 Subject: selftests: core: use SKIP instead of XFAIL in close_range_test.c XFAIL is gone since commit 9847d24af95c ("selftests/harness: Refactor XFAIL into SKIP"), use SKIP instead. Fixes: 9847d24af95c ("selftests/harness: Refactor XFAIL into SKIP") Signed-off-by: Tommi Rantala Reviewed-by: Kees Cook Acked-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/core/close_range_test.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index c99b98b0d461..575b391ddc78 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -44,7 +44,7 @@ TEST(close_range) fd = open("/dev/null", O_RDONLY | O_CLOEXEC); ASSERT_GE(fd, 0) { if (errno == ENOENT) - XFAIL(return, "Skipping test since /dev/null does not exist"); + SKIP(return, "Skipping test since /dev/null does not exist"); } open_fds[i] = fd; @@ -52,7 +52,7 @@ TEST(close_range) EXPECT_EQ(-1, sys_close_range(open_fds[0], open_fds[100], -1)) { if (errno == ENOSYS) - XFAIL(return, "close_range() syscall not supported"); + SKIP(return, "close_range() syscall not supported"); } EXPECT_EQ(0, sys_close_range(open_fds[0], open_fds[50], 0)); @@ -108,7 +108,7 @@ TEST(close_range_unshare) fd = open("/dev/null", O_RDONLY | O_CLOEXEC); ASSERT_GE(fd, 0) { if (errno == ENOENT) - XFAIL(return, "Skipping test since /dev/null does not exist"); + SKIP(return, "Skipping test since /dev/null does not exist"); } open_fds[i] = fd; @@ -197,7 +197,7 @@ TEST(close_range_unshare_capped) fd = open("/dev/null", O_RDONLY | O_CLOEXEC); ASSERT_GE(fd, 0) { if (errno == ENOENT) - XFAIL(return, "Skipping test since /dev/null does not exist"); + SKIP(return, "Skipping test since /dev/null does not exist"); } open_fds[i] = fd; -- cgit v1.2.3 From afba8b0a2cc532b54eaf4254092f57bba5d7eb65 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 8 Oct 2020 15:26:32 +0300 Subject: selftests: clone3: use SKIP instead of XFAIL XFAIL is gone since commit 9847d24af95c ("selftests/harness: Refactor XFAIL into SKIP"), use SKIP instead. Fixes: 9847d24af95c ("selftests/harness: Refactor XFAIL into SKIP") Signed-off-by: Tommi Rantala Reviewed-by: Kees Cook Acked-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c index 55bd387ce7ec..52d3f0364bda 100644 --- a/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c +++ b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c @@ -145,7 +145,7 @@ TEST(clone3_cap_checkpoint_restore) test_clone3_supported(); EXPECT_EQ(getuid(), 0) - XFAIL(return, "Skipping all tests as non-root\n"); + SKIP(return, "Skipping all tests as non-root"); memset(&set_tid, 0, sizeof(set_tid)); -- cgit v1.2.3 From 7d764b685ee1bc73a9fa2b6cb4d42fa72b943145 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Thu, 8 Oct 2020 15:26:33 +0300 Subject: selftests: binderfs: use SKIP instead of XFAIL XFAIL is gone since commit 9847d24af95c ("selftests/harness: Refactor XFAIL into SKIP"), use SKIP instead. Fixes: 9847d24af95c ("selftests/harness: Refactor XFAIL into SKIP") Signed-off-by: Tommi Rantala Reviewed-by: Kees Cook Acked-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/filesystems/binderfs/binderfs_test.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c index 1d27f52c61e6..477cbb042f5b 100644 --- a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c +++ b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c @@ -74,7 +74,7 @@ static int __do_binderfs_test(struct __test_metadata *_metadata) ret = mount(NULL, binderfs_mntpt, "binder", 0, 0); EXPECT_EQ(ret, 0) { if (errno == ENODEV) - XFAIL(goto out, "binderfs missing"); + SKIP(goto out, "binderfs missing"); TH_LOG("%s - Failed to mount binderfs", strerror(errno)); goto rmdir; } @@ -475,10 +475,10 @@ TEST(binderfs_stress) TEST(binderfs_test_privileged) { if (geteuid() != 0) - XFAIL(return, "Tests are not run as root. Skipping privileged tests"); + SKIP(return, "Tests are not run as root. Skipping privileged tests"); if (__do_binderfs_test(_metadata)) - XFAIL(return, "The Android binderfs filesystem is not available"); + SKIP(return, "The Android binderfs filesystem is not available"); } TEST(binderfs_test_unprivileged) @@ -511,7 +511,7 @@ TEST(binderfs_test_unprivileged) ret = wait_for_pid(pid); if (ret) { if (ret == 2) - XFAIL(return, "The Android binderfs filesystem is not available"); + SKIP(return, "The Android binderfs filesystem is not available"); ASSERT_EQ(ret, 0) { TH_LOG("wait_for_pid() failed"); } -- cgit v1.2.3 From 1bd14a66ee5200d6a24419cbd2e0a0fccd4da36f Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Wed, 7 Oct 2020 14:51:59 -0700 Subject: RISC-V: Remove any memblock representing unusable memory area RISC-V limits the physical memory size by -PAGE_OFFSET. Any memory beyond that size from DRAM start is unusable. Just remove any memblock pointing to those memory region without worrying about computing the maximum size. Signed-off-by: Atish Patra Reviewed-by: Mike Rapoport Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index ea933b789a88..42c14eb6bc43 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -154,9 +154,8 @@ disable: void __init setup_bootmem(void) { - phys_addr_t mem_size = 0; - phys_addr_t total_mem = 0; - phys_addr_t mem_start, start, end = 0; + phys_addr_t mem_start = 0; + phys_addr_t start, end = 0; phys_addr_t vmlinux_end = __pa_symbol(&_end); phys_addr_t vmlinux_start = __pa_symbol(&_start); u64 i; @@ -164,21 +163,18 @@ void __init setup_bootmem(void) /* Find the memory region containing the kernel */ for_each_mem_range(i, &start, &end) { phys_addr_t size = end - start; - if (!total_mem) + if (!mem_start) mem_start = start; if (start <= vmlinux_start && vmlinux_end <= end) BUG_ON(size == 0); - total_mem = total_mem + size; } /* - * Remove memblock from the end of usable area to the - * end of region + * The maximal physical memory size is -PAGE_OFFSET. + * Make sure that any memory beyond mem_start + (-PAGE_OFFSET) is removed + * as it is unusable by kernel. */ - mem_size = min(total_mem, (phys_addr_t)-PAGE_OFFSET); - if (mem_start + mem_size < end) - memblock_remove(mem_start + mem_size, - end - mem_start - mem_size); + memblock_enforce_memory_limit(mem_start - PAGE_OFFSET); /* Reserve from the start of the kernel to the end of the kernel */ memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); -- cgit v1.2.3 From e68e28b4a9d71261e3f8fd05a72d6cf0b443a493 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Wed, 30 Sep 2020 16:31:11 +0300 Subject: net/mlx5e: Fix modify header actions memory leak Modify header actions are allocated during parse tc actions and only freed during the flow creation, however, on error flow the allocated memory is wrongly unfreed. Fix this by calling dealloc_mod_hdr_actions in __mlx5e_add_fdb_flow and mlx5e_add_nic_flow error flow. Fixes: d7e75a325cb2 ("net/mlx5e: Add offloading of E-Switch TC pedit (header re-write) actions") Fixes: 2f4fe4cab073 ("net/mlx5e: Add offloading of NIC TC pedit (header re-write) actions") Signed-off-by: Maor Dickman Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index e3a968e9e2a0..2e2fa0440032 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -4658,6 +4658,7 @@ __mlx5e_add_fdb_flow(struct mlx5e_priv *priv, return flow; err_free: + dealloc_mod_hdr_actions(&parse_attr->mod_hdr_acts); mlx5e_flow_put(priv, flow); out: return ERR_PTR(err); @@ -4802,6 +4803,7 @@ mlx5e_add_nic_flow(struct mlx5e_priv *priv, return 0; err_free: + dealloc_mod_hdr_actions(&parse_attr->mod_hdr_acts); mlx5e_flow_put(priv, flow); out: return err; -- cgit v1.2.3 From 78c906e430b13d30a8cfbdef4ccbbe1686841a9e Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 31 Aug 2020 16:17:29 +0300 Subject: net/mlx5e: Protect encap route dev from concurrent release In functions mlx5e_route_lookup_ipv{4|6}() route_dev can be arbitrary net device and not necessary mlx5 eswitch port representor. As such, in order to ensure that route_dev is not destroyed concurrent the code needs either explicitly take reference to the device before releasing reference to rtable instance or ensure that caller holds rtnl lock. First approach is chosen as a fix since rtnl lock dependency was intentionally removed from mlx5 TC layer. To prevent unprotected usage of route_dev in encap code take a reference to the device before releasing rt. Don't save direct pointer to the device in mlx5_encap_entry structure and use ifindex instead. Modify users of route_dev pointer to properly obtain the net device instance from its ifindex. Fixes: 61086f391044 ("net/mlx5e: Protect encap hash table with mutex") Fixes: 6707f74be862 ("net/mlx5e: Update hw flows when encap source mac changed") Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/rep/tc.c | 6 +- .../net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 72 ++++++++++++++-------- drivers/net/ethernet/mellanox/mlx5/core/en_rep.h | 2 +- 3 files changed, 52 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c index e36e505d38ad..d29af7b9c695 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c @@ -107,12 +107,16 @@ void mlx5e_rep_update_flows(struct mlx5e_priv *priv, mlx5e_tc_encap_flows_del(priv, e, &flow_list); if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) { + struct net_device *route_dev; + ether_addr_copy(e->h_dest, ha); ether_addr_copy(eth->h_dest, ha); /* Update the encap source mac, in case that we delete * the flows when encap source mac changed. */ - ether_addr_copy(eth->h_source, e->route_dev->dev_addr); + route_dev = __dev_get_by_index(dev_net(priv->netdev), e->route_dev_ifindex); + if (route_dev) + ether_addr_copy(eth->h_source, route_dev->dev_addr); mlx5e_tc_encap_flows_add(priv, e, &flow_list); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 7cce85faa16f..90930e54b6f2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -77,13 +77,13 @@ static int get_route_and_out_devs(struct mlx5e_priv *priv, return 0; } -static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, - struct net_device *mirred_dev, - struct net_device **out_dev, - struct net_device **route_dev, - struct flowi4 *fl4, - struct neighbour **out_n, - u8 *out_ttl) +static int mlx5e_route_lookup_ipv4_get(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct net_device **out_dev, + struct net_device **route_dev, + struct flowi4 *fl4, + struct neighbour **out_n, + u8 *out_ttl) { struct neighbour *n; struct rtable *rt; @@ -117,18 +117,28 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, ip_rt_put(rt); return ret; } + dev_hold(*route_dev); if (!(*out_ttl)) *out_ttl = ip4_dst_hoplimit(&rt->dst); n = dst_neigh_lookup(&rt->dst, &fl4->daddr); ip_rt_put(rt); - if (!n) + if (!n) { + dev_put(*route_dev); return -ENOMEM; + } *out_n = n; return 0; } +static void mlx5e_route_lookup_ipv4_put(struct net_device *route_dev, + struct neighbour *n) +{ + neigh_release(n); + dev_put(route_dev); +} + static const char *mlx5e_netdev_kind(struct net_device *dev) { if (dev->rtnl_link_ops) @@ -193,8 +203,8 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, fl4.saddr = tun_key->u.ipv4.src; ttl = tun_key->ttl; - err = mlx5e_route_lookup_ipv4(priv, mirred_dev, &out_dev, &route_dev, - &fl4, &n, &ttl); + err = mlx5e_route_lookup_ipv4_get(priv, mirred_dev, &out_dev, &route_dev, + &fl4, &n, &ttl); if (err) return err; @@ -223,7 +233,7 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, e->m_neigh.family = n->ops->family; memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len); e->out_dev = out_dev; - e->route_dev = route_dev; + e->route_dev_ifindex = route_dev->ifindex; /* It's important to add the neigh to the hash table before checking * the neigh validity state. So if we'll get a notification, in case the @@ -278,7 +288,7 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev)); - neigh_release(n); + mlx5e_route_lookup_ipv4_put(route_dev, n); return err; destroy_neigh_entry: @@ -286,18 +296,18 @@ destroy_neigh_entry: free_encap: kfree(encap_header); release_neigh: - neigh_release(n); + mlx5e_route_lookup_ipv4_put(route_dev, n); return err; } #if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) -static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv, - struct net_device *mirred_dev, - struct net_device **out_dev, - struct net_device **route_dev, - struct flowi6 *fl6, - struct neighbour **out_n, - u8 *out_ttl) +static int mlx5e_route_lookup_ipv6_get(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct net_device **out_dev, + struct net_device **route_dev, + struct flowi6 *fl6, + struct neighbour **out_n, + u8 *out_ttl) { struct dst_entry *dst; struct neighbour *n; @@ -318,15 +328,25 @@ static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv, return ret; } + dev_hold(*route_dev); n = dst_neigh_lookup(dst, &fl6->daddr); dst_release(dst); - if (!n) + if (!n) { + dev_put(*route_dev); return -ENOMEM; + } *out_n = n; return 0; } +static void mlx5e_route_lookup_ipv6_put(struct net_device *route_dev, + struct neighbour *n) +{ + neigh_release(n); + dev_put(route_dev); +} + int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, struct net_device *mirred_dev, struct mlx5e_encap_entry *e) @@ -348,8 +368,8 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, fl6.daddr = tun_key->u.ipv6.dst; fl6.saddr = tun_key->u.ipv6.src; - err = mlx5e_route_lookup_ipv6(priv, mirred_dev, &out_dev, &route_dev, - &fl6, &n, &ttl); + err = mlx5e_route_lookup_ipv6_get(priv, mirred_dev, &out_dev, &route_dev, + &fl6, &n, &ttl); if (err) return err; @@ -378,7 +398,7 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, e->m_neigh.family = n->ops->family; memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len); e->out_dev = out_dev; - e->route_dev = route_dev; + e->route_dev_ifindex = route_dev->ifindex; /* It's importent to add the neigh to the hash table before checking * the neigh validity state. So if we'll get a notification, in case the @@ -433,7 +453,7 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev)); - neigh_release(n); + mlx5e_route_lookup_ipv6_put(route_dev, n); return err; destroy_neigh_entry: @@ -441,7 +461,7 @@ destroy_neigh_entry: free_encap: kfree(encap_header); release_neigh: - neigh_release(n); + mlx5e_route_lookup_ipv6_put(route_dev, n); return err; } #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index 9020d1419bcf..8932c387d46a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -186,7 +186,7 @@ struct mlx5e_encap_entry { unsigned char h_dest[ETH_ALEN]; /* destination eth addr */ struct net_device *out_dev; - struct net_device *route_dev; + int route_dev_ifindex; struct mlx5e_tc_tunnel *tunnel; int reformat_type; u8 flags; -- cgit v1.2.3 From f42139ba49791ab6b12443c60044872705b74a1e Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 8 Oct 2020 11:34:03 +0300 Subject: net/mlx5e: Use spin_lock_bh for async_icosq_lock async_icosq_lock may be taken from softirq and non-softirq contexts. It requires protection with spin_lock_bh, otherwise a softirq may be triggered in the middle of the critical section, and it may deadlock if it tries to take the same lock. This patch fixes such a scenario by using spin_lock_bh to disable softirqs on that CPU while inside the critical section. Fixes: 8d94b590f1e4 ("net/mlx5e: Turn XSK ICOSQ into a general asynchronous one") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c index 4e574ac73019..be3465ba38ca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -122,9 +122,9 @@ void mlx5e_activate_xsk(struct mlx5e_channel *c) set_bit(MLX5E_RQ_STATE_ENABLED, &c->xskrq.state); /* TX queue is created active. */ - spin_lock(&c->async_icosq_lock); + spin_lock_bh(&c->async_icosq_lock); mlx5e_trigger_irq(&c->async_icosq); - spin_unlock(&c->async_icosq_lock); + spin_unlock_bh(&c->async_icosq_lock); } void mlx5e_deactivate_xsk(struct mlx5e_channel *c) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c index fb671a457129..8e96260fce1d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c @@ -36,9 +36,9 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) if (test_and_set_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state)) return 0; - spin_lock(&c->async_icosq_lock); + spin_lock_bh(&c->async_icosq_lock); mlx5e_trigger_irq(&c->async_icosq); - spin_unlock(&c->async_icosq_lock); + spin_unlock_bh(&c->async_icosq_lock); } return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index ccaccb9fc2f7..7f6221b8b1f7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -188,7 +188,7 @@ static int post_rx_param_wqes(struct mlx5e_channel *c, err = 0; sq = &c->async_icosq; - spin_lock(&c->async_icosq_lock); + spin_lock_bh(&c->async_icosq_lock); cseg = post_static_params(sq, priv_rx); if (IS_ERR(cseg)) @@ -199,7 +199,7 @@ static int post_rx_param_wqes(struct mlx5e_channel *c, mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); unlock: - spin_unlock(&c->async_icosq_lock); + spin_unlock_bh(&c->async_icosq_lock); return err; @@ -265,10 +265,10 @@ resync_post_get_progress_params(struct mlx5e_icosq *sq, BUILD_BUG_ON(MLX5E_KTLS_GET_PROGRESS_WQEBBS != 1); - spin_lock(&sq->channel->async_icosq_lock); + spin_lock_bh(&sq->channel->async_icosq_lock); if (unlikely(!mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, 1))) { - spin_unlock(&sq->channel->async_icosq_lock); + spin_unlock_bh(&sq->channel->async_icosq_lock); err = -ENOSPC; goto err_dma_unmap; } @@ -299,7 +299,7 @@ resync_post_get_progress_params(struct mlx5e_icosq *sq, icosq_fill_wi(sq, pi, &wi); sq->pc++; mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); - spin_unlock(&sq->channel->async_icosq_lock); + spin_unlock_bh(&sq->channel->async_icosq_lock); return 0; @@ -360,7 +360,7 @@ static int resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_rx err = 0; sq = &c->async_icosq; - spin_lock(&c->async_icosq_lock); + spin_lock_bh(&c->async_icosq_lock); cseg = post_static_params(sq, priv_rx); if (IS_ERR(cseg)) { @@ -372,7 +372,7 @@ static int resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_rx mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); priv_rx->stats->tls_resync_res_ok++; unlock: - spin_unlock(&c->async_icosq_lock); + spin_unlock_bh(&c->async_icosq_lock); return err; } -- cgit v1.2.3 From 465e7baab6d93b399344f5868f84c177ab5cd16f Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 21 Oct 2020 08:42:49 +0300 Subject: net/mlx5: Fix deletion of duplicate rules When a rule is duplicated, the refcount of the rule is increased so only the second deletion of the rule should cause destruction of the FTE. Currently, the FTE will be destroyed in the first deletion of rule since the modify_mask will be 0. Fix it and call to destroy FTE only if all the rules (FTE's children) have been removed. Fixes: 718ce4d601db ("net/mlx5: Consolidate update FTE for all removal changes") Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 16091838bfcf..325a5b0d6829 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -2010,10 +2010,11 @@ void mlx5_del_flow_rules(struct mlx5_flow_handle *handle) down_write_ref_node(&fte->node, false); for (i = handle->num_rules - 1; i >= 0; i--) tree_remove_node(&handle->rule[i]->node, true); - if (fte->modify_mask && fte->dests_size) { - modify_fte(fte); + if (fte->dests_size) { + if (fte->modify_mask) + modify_fte(fte); up_write_ref_node(&fte->node, false); - } else { + } else if (list_empty(&fte->node.children)) { del_hw_fte(&fte->node); /* Avoid double call to del_hw_fte */ fte->node.del_hw_func = NULL; -- cgit v1.2.3 From ae35859445607f7f18dd4f332749219cd636ed59 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 2 Nov 2020 12:41:28 +0200 Subject: net/mlx5: E-switch, Avoid extack error log for disabled vport When E-switch vport is disabled, querying its hardware address is unsupported. Avoid setting extack error log message in such case. Fixes: f099fde16db3 ("net/mlx5: E-switch, Support querying port function mac address") Signed-off-by: Parav Pandit Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 6e6a9a563992..e8e6294c7cca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1902,8 +1902,6 @@ int mlx5_devlink_port_function_hw_addr_get(struct devlink *devlink, ether_addr_copy(hw_addr, vport->info.mac); *hw_addr_len = ETH_ALEN; err = 0; - } else { - NL_SET_ERR_MSG_MOD(extack, "Eswitch vport is disabled"); } mutex_unlock(&esw->state_lock); return err; -- cgit v1.2.3 From c5eb51adf06b2644fa28d4af886bfdcc53e288da Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 23 Sep 2020 12:58:44 +0300 Subject: net/mlx5e: Fix VXLAN synchronization after function reload During driver reload, perform firmware tear-down which results in firmware losing the configured VXLAN ports. These ports are still available in the driver's database. Fix this by cleaning up driver's VXLAN database in the nic unload flow, before firmware tear-down. With that, minimize mlx5_vxlan_destroy() to remove only what was added in mlx5_vxlan_create() and warn on leftover UDP ports. Fixes: 18a2b7f969c9 ("net/mlx5: convert to new udp_tunnel infrastructure") Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 + .../net/ethernet/mellanox/mlx5/core/lib/vxlan.c | 23 ++++++++++++++++------ .../net/ethernet/mellanox/mlx5/core/lib/vxlan.h | 2 ++ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b3f02aac7f26..ebce97921e03 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5253,6 +5253,7 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv) mlx5e_disable_async_events(priv); mlx5_lag_remove(mdev); + mlx5_vxlan_reset_to_default(mdev->vxlan); } int mlx5e_update_nic_rx(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c index 3315afe2f8dc..38084400ee8f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c @@ -167,6 +167,17 @@ struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev) } void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) +{ + if (!mlx5_vxlan_allowed(vxlan)) + return; + + mlx5_vxlan_del_port(vxlan, IANA_VXLAN_UDP_PORT); + WARN_ON(!hash_empty(vxlan->htable)); + + kfree(vxlan); +} + +void mlx5_vxlan_reset_to_default(struct mlx5_vxlan *vxlan) { struct mlx5_vxlan_port *vxlanp; struct hlist_node *tmp; @@ -175,12 +186,12 @@ void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) if (!mlx5_vxlan_allowed(vxlan)) return; - /* Lockless since we are the only hash table consumers*/ hash_for_each_safe(vxlan->htable, bkt, tmp, vxlanp, hlist) { - hash_del(&vxlanp->hlist); - mlx5_vxlan_core_del_port_cmd(vxlan->mdev, vxlanp->udp_port); - kfree(vxlanp); + /* Don't delete default UDP port added by the HW. + * Remove only user configured ports + */ + if (vxlanp->udp_port == IANA_VXLAN_UDP_PORT) + continue; + mlx5_vxlan_del_port(vxlan, vxlanp->udp_port); } - - kfree(vxlan); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h index ec766529f49b..34ef662da35e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h @@ -56,6 +56,7 @@ void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan); int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port); int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port); bool mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port); +void mlx5_vxlan_reset_to_default(struct mlx5_vxlan *vxlan); #else static inline struct mlx5_vxlan* mlx5_vxlan_create(struct mlx5_core_dev *mdev) { return ERR_PTR(-EOPNOTSUPP); } @@ -63,6 +64,7 @@ static inline void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) { return; } static inline int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port) { return -EOPNOTSUPP; } static inline int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port) { return -EOPNOTSUPP; } static inline bool mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port) { return false; } +static inline void mlx5_vxlan_reset_to_default(struct mlx5_vxlan *vxlan) { return; } #endif #endif /* __MLX5_VXLAN_H__ */ -- cgit v1.2.3 From 1a50cf9a67ff2241c2949d30bc11c8dd4280eef8 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 22 Oct 2020 12:49:51 +0300 Subject: net/mlx5e: Fix incorrect access of RCU-protected xdp_prog rq->xdp_prog is RCU-protected and should be accessed only with rcu_access_pointer for the NULL check in mlx5e_poll_rx_cq. rq->xdp_prog may change on the fly only from one non-NULL value to another non-NULL value, so the checks in mlx5e_xdp_handle and mlx5e_poll_rx_cq will have the same result during one NAPI cycle, meaning that no additional synchronization is needed. Fixes: fe45386a2082 ("net/mlx5e: Use RCU to protect rq->xdp_prog") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 599f5b5ebc97..6628a0197b4e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1584,7 +1584,7 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) } while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(cqwq))); out: - if (rq->xdp_prog) + if (rcu_access_pointer(rq->xdp_prog)) mlx5e_xdp_rx_poll_complete(rq); mlx5_cqwq_update_db_record(cqwq); -- cgit v1.2.3 From 1978b3a53a74e3230cd46932b149c6e62e832e9a Mon Sep 17 00:00:00 2001 From: Anand K Mistry Date: Thu, 5 Nov 2020 16:33:04 +1100 Subject: x86/speculation: Allow IBPB to be conditionally enabled on CPUs with always-on STIBP On AMD CPUs which have the feature X86_FEATURE_AMD_STIBP_ALWAYS_ON, STIBP is set to on and spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED At the same time, IBPB can be set to conditional. However, this leads to the case where it's impossible to turn on IBPB for a process because in the PR_SPEC_DISABLE case in ib_prctl_set() the spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED condition leads to a return before the task flag is set. Similarly, ib_prctl_get() will return PR_SPEC_DISABLE even though IBPB is set to conditional. More generally, the following cases are possible: 1. STIBP = conditional && IBPB = on for spectre_v2_user=seccomp,ibpb 2. STIBP = on && IBPB = conditional for AMD CPUs with X86_FEATURE_AMD_STIBP_ALWAYS_ON The first case functions correctly today, but only because spectre_v2_user_ibpb isn't updated to reflect the IBPB mode. At a high level, this change does one thing. If either STIBP or IBPB is set to conditional, allow the prctl to change the task flag. Also, reflect that capability when querying the state. This isn't perfect since it doesn't take into account if only STIBP or IBPB is unconditionally on. But it allows the conditional feature to work as expected, without affecting the unconditional one. [ bp: Massage commit message and comment; space out statements for better readability. ] Fixes: 21998a351512 ("x86/speculation: Avoid force-disabling IBPB based on STIBP and enhanced IBRS.") Signed-off-by: Anand K Mistry Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Acked-by: Tom Lendacky Link: https://lkml.kernel.org/r/20201105163246.v2.1.Ifd7243cd3e2c2206a893ad0a5b9a4f19549e22c6@changeid --- arch/x86/kernel/cpu/bugs.c | 51 ++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d3f0db463f96..581fb7223ad0 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1254,6 +1254,14 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) return 0; } +static bool is_spec_ib_user_controlled(void) +{ + return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP; +} + static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) { switch (ctrl) { @@ -1261,16 +1269,26 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return 0; + /* - * Indirect branch speculation is always disabled in strict - * mode. It can neither be enabled if it was force-disabled - * by a previous prctl call. + * With strict mode for both IBPB and STIBP, the instruction + * code paths avoid checking this task flag and instead, + * unconditionally run the instruction. However, STIBP and IBPB + * are independent and either can be set to conditionally + * enabled regardless of the mode of the other. + * + * If either is set to conditional, allow the task flag to be + * updated, unless it was force-disabled by a previous prctl + * call. Currently, this is possible on an AMD CPU which has the + * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the + * kernel is booted with 'spectre_v2_user=seccomp', then + * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and + * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED. */ - if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED || + if (!is_spec_ib_user_controlled() || task_spec_ib_force_disable(task)) return -EPERM; + task_clear_spec_ib_disable(task); task_update_spec_tif(task); break; @@ -1283,10 +1301,10 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return -EPERM; - if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) + + if (!is_spec_ib_user_controlled()) return 0; + task_set_spec_ib_disable(task); if (ctrl == PR_SPEC_FORCE_DISABLE) task_set_spec_ib_force_disable(task); @@ -1351,20 +1369,17 @@ static int ib_prctl_get(struct task_struct *task) if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return PR_SPEC_ENABLE; - else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) - return PR_SPEC_DISABLE; - else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || - spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || - spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || - spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) { + else if (is_spec_ib_user_controlled()) { if (task_spec_ib_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ib_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - } else + } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) + return PR_SPEC_DISABLE; + else return PR_SPEC_NOT_AFFECTED; } -- cgit v1.2.3 From 1905cac9d621a10358bc2750f8b25b64df439a21 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 23 Oct 2020 10:41:01 -0400 Subject: NFSD: NFSv3 PATHCONF Reply is improperly formed Commit cc028a10a48c ("NFSD: Hoist status code encoding into XDR encoder functions") missed a spot. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3xdr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9c23b6acf234..2277f83da250 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -1114,6 +1114,7 @@ nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_pathconfres *resp = rqstp->rq_resp; + *p++ = resp->status; *p++ = xdr_zero; /* no post_op_attr */ if (resp->status == 0) { -- cgit v1.2.3 From d321ff589c16d8c2207485a6d7fbdb14e873d46e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 23 Oct 2020 10:41:07 -0400 Subject: SUNRPC: Fix general protection fault in trace_rpc_xdr_overflow() The TP_fast_assign() section is careful enough not to dereference xdr->rqst if it's NULL. The TP_STRUCT__entry section is not. Fixes: 5582863f450c ("SUNRPC: Add XDR overflow trace event") Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/trace/events/sunrpc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index f45b3c01370c..2477014e3fa6 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -655,10 +655,10 @@ TRACE_EVENT(rpc_xdr_overflow, __field(size_t, tail_len) __field(unsigned int, page_len) __field(unsigned int, len) - __string(progname, - xdr->rqst->rq_task->tk_client->cl_program->name) - __string(procedure, - xdr->rqst->rq_task->tk_msg.rpc_proc->p_name) + __string(progname, xdr->rqst ? + xdr->rqst->rq_task->tk_client->cl_program->name : "unknown") + __string(procedure, xdr->rqst ? + xdr->rqst->rq_task->tk_msg.rpc_proc->p_name : "unknown") ), TP_fast_assign( -- cgit v1.2.3 From 66d60e3ad1e44d42d940767f62bf265f107fb628 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 23 Oct 2020 10:41:12 -0400 Subject: NFSD: MKNOD should return NFSERR_BADTYPE instead of NFSERR_INVAL A late paragraph of RFC 1813 Section 3.3.11 states: | ... if the server does not support the target type or the | target type is illegal, the error, NFS3ERR_BADTYPE, should | be returned. Note that NF3REG, NF3DIR, and NF3LNK are | illegal types for MKNOD. The Linux NFS server incorrectly returns NFSERR_INVAL in these cases. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3proc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 14468613d150..a633044b0dc1 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -316,10 +316,6 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp) fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); - if (argp->ftype == 0 || argp->ftype >= NF3BAD) { - resp->status = nfserr_inval; - goto out; - } if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) { rdev = MKDEV(argp->major, argp->minor); if (MAJOR(rdev) != argp->major || @@ -328,7 +324,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp) goto out; } } else if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) { - resp->status = nfserr_inval; + resp->status = nfserr_badtype; goto out; } -- cgit v1.2.3 From 36e1e5ba90fb3fba6888fae26e4dfc28bf70aaf1 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Thu, 29 Oct 2020 15:07:15 -0400 Subject: NFSD: Fix use-after-free warning when doing inter-server copy The source file nfsd_file is not constructed the same as other nfsd_file's via nfsd_file_alloc. nfsd_file_put should not be called to free the object; nfsd_file_put is not the inverse of kzalloc, instead kfree is called by nfsd4_do_async_copy when done. Fixes: ce0887ac96d3 ("NFSD add nfs4 inter ssc to nfsd4_copy") Signed-off-by: Dai Ngo Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ad2fa1a8e7ad..9c43cad7e408 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1299,7 +1299,7 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src, struct nfsd_file *dst) { nfs42_ssc_close(src->nf_file); - nfsd_file_put(src); + /* 'src' is freed by nfsd4_do_async_copy */ nfsd_file_put(dst); mntput(ss_mnt); } -- cgit v1.2.3 From 49a361327332c9221438397059067f9b205f690d Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Thu, 29 Oct 2020 15:07:16 -0400 Subject: NFSD: fix missing refcount in nfsd4_copy by nfsd4_do_async_copy Need to initialize nfsd4_copy's refcount to 1 to avoid use-after-free warning when nfs4_put_copy is called from nfsd4_cb_offload_release. Fixes: ce0887ac96d3 ("NFSD add nfs4 inter ssc to nfsd4_copy") Signed-off-by: Dai Ngo Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 9c43cad7e408..e83b21778816 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1486,6 +1486,7 @@ do_callback: cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); if (!cb_copy) goto out; + refcount_set(&cb_copy->refcount, 1); memcpy(&cb_copy->cp_res, ©->cp_res, sizeof(copy->cp_res)); cb_copy->cp_clp = copy->cp_clp; cb_copy->nfserr = copy->nfserr; -- cgit v1.2.3 From 9a472ef7a3690ac0b77ebfb04c88fa795de2adea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Nov 2020 22:31:37 +0000 Subject: io_uring: fix link lookup racing with link timeout We can't just go over linked requests because it may race with linked timeouts. Take ctx->completion_lock in that case. Cc: stable@vger.kernel.org # v5.7+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 29f1417690d5..8018c7076b25 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8470,7 +8470,21 @@ static bool io_timeout_remove_link(struct io_ring_ctx *ctx, static bool io_cancel_link_cb(struct io_wq_work *work, void *data) { - return io_match_link(container_of(work, struct io_kiocb, work), data); + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + bool ret; + + if (req->flags & REQ_F_LINK_TIMEOUT) { + unsigned long flags; + struct io_ring_ctx *ctx = req->ctx; + + /* protect against races with linked timeouts */ + spin_lock_irqsave(&ctx->completion_lock, flags); + ret = io_match_link(req, data); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + ret = io_match_link(req, data); + } + return ret; } static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) -- cgit v1.2.3 From 79605f1394261995c2b955c906a5a20fb27cdc84 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 22 Oct 2020 16:30:12 -0400 Subject: riscv: Set text_offset correctly for M-Mode M-Mode Linux is loaded at the start of RAM, not 2MB later. Perhaps this should be calculated based on PAGE_OFFSET somehow? Even better would be to deprecate text_offset and instead introduce something absolute. Signed-off-by: Sean Anderson Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/head.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 11e2a4fe66e0..7e849797c9c3 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -35,12 +35,17 @@ ENTRY(_start) .word 0 #endif .balign 8 +#ifdef CONFIG_RISCV_M_MODE + /* Image load offset (0MB) from start of RAM for M-mode */ + .dword 0 +#else #if __riscv_xlen == 64 /* Image load offset(2MB) from start of RAM */ .dword 0x200000 #else /* Image load offset(4MB) from start of RAM */ .dword 0x400000 +#endif #endif /* Effective size of kernel image */ .dword _end - _start -- cgit v1.2.3 From f9b7ff0d7f7a466a920424246e7ddc2b84c87e52 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 5 Nov 2020 11:52:30 +0000 Subject: tools/bpftool: Fix attaching flow dissector My earlier patch to reject non-zero arguments to flow dissector attach broke attaching via bpftool. Instead of 0 it uses -1 for target_fd. Fix this by passing a zero argument when attaching the flow dissector. Fixes: 1b514239e859 ("bpf: flow_dissector: Check value of unused flags to BPF_PROG_ATTACH") Reported-by: Jiri Benc Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201105115230.296657-1-lmb@cloudflare.com --- tools/bpf/bpftool/prog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index d942c1e3372c..acdb2c245f0a 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -940,7 +940,7 @@ static int parse_attach_detach_args(int argc, char **argv, int *progfd, } if (*attach_type == BPF_FLOW_DISSECTOR) { - *mapfd = -1; + *mapfd = 0; return 0; } -- cgit v1.2.3 From 7c0afcad7507636529e6a5a2a5eef5482619a449 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 5 Nov 2020 11:51:09 -0800 Subject: bpf: BPF_PRELOAD depends on BPF_SYSCALL Fix build error when BPF_SYSCALL is not set/enabled but BPF_PRELOAD is by making BPF_PRELOAD depend on BPF_SYSCALL. ERROR: modpost: "bpf_preload_ops" [kernel/bpf/preload/bpf_preload.ko] undefined! Reported-by: kernel test robot Reported-by: Randy Dunlap Signed-off-by: Randy Dunlap Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201105195109.26232-1-rdunlap@infradead.org --- kernel/bpf/preload/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index ace49111d3a3..26bced262473 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -6,6 +6,7 @@ config USERMODE_DRIVER menuconfig BPF_PRELOAD bool "Preload BPF file system with kernel specific program and map iterators" depends on BPF + depends on BPF_SYSCALL # The dependency on !COMPILE_TEST prevents it from being enabled # in allmodconfig or allyesconfig configurations depends on !COMPILE_TEST -- cgit v1.2.3 From 3fb4a8fa28b740709bdd3229b80279957f4d37ed Mon Sep 17 00:00:00 2001 From: Scott Cheloha Date: Thu, 5 Nov 2020 16:30:40 -0600 Subject: powerpc/numa: Fix build when CONFIG_NUMA=n Add a non-NUMA definition for of_drconf_to_nid_single() to topology.h so we have one even if powerpc/mm/numa.c is not compiled. On a non-NUMA kernel the appropriate node id is always first_online_node. Fixes: 72cdd117c449 ("pseries/hotplug-memory: hot-add: skip redundant LMB lookup") Reported-by: kernel test robot Signed-off-by: Scott Cheloha Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201105223040.3612663-1-cheloha@linux.ibm.com --- arch/powerpc/include/asm/topology.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 8728590f514a..3beeb030cd78 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -6,6 +6,7 @@ struct device; struct device_node; +struct drmem_lmb; #ifdef CONFIG_NUMA @@ -61,6 +62,9 @@ static inline int early_cpu_to_node(int cpu) */ return (nid < 0) ? 0 : nid; } + +int of_drconf_to_nid_single(struct drmem_lmb *lmb); + #else static inline int early_cpu_to_node(int cpu) { return 0; } @@ -84,10 +88,12 @@ static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) return 0; } -#endif /* CONFIG_NUMA */ +static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb) +{ + return first_online_node; +} -struct drmem_lmb; -int of_drconf_to_nid_single(struct drmem_lmb *lmb); +#endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) extern int find_and_online_cpu_nid(int cpu); -- cgit v1.2.3 From d3bec0138bfbe58606fc1d6f57a4cdc1a20218db Mon Sep 17 00:00:00 2001 From: David Verbeiren Date: Wed, 4 Nov 2020 12:23:32 +0100 Subject: bpf: Zero-fill re-used per-cpu map element Zero-fill element values for all other cpus than current, just as when not using prealloc. This is the only way the bpf program can ensure known initial values for all cpus ('onallcpus' cannot be set when coming from the bpf program). The scenario is: bpf program inserts some elements in a per-cpu map, then deletes some (or userspace does). When later adding new elements using bpf_map_update_elem(), the bpf program can only set the value of the new elements for the current cpu. When prealloc is enabled, previously deleted elements are re-used. Without the fix, values for other cpus remain whatever they were when the re-used entry was previously freed. A selftest is added to validate correct operation in above scenario as well as in case of LRU per-cpu map element re-use. Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") Signed-off-by: David Verbeiren Signed-off-by: Alexei Starovoitov Acked-by: Matthieu Baerts Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201104112332.15191-1-david.verbeiren@tessares.net --- kernel/bpf/hashtab.c | 30 ++- tools/testing/selftests/bpf/prog_tests/map_init.c | 214 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/test_map_init.c | 33 ++++ 3 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/map_init.c create mode 100644 tools/testing/selftests/bpf/progs/test_map_init.c diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 1815e97d4c9c..1fccba6e88c4 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -821,6 +821,32 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, } } +static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, + void *value, bool onallcpus) +{ + /* When using prealloc and not setting the initial value on all cpus, + * zero-fill element values for other cpus (just as what happens when + * not using prealloc). Otherwise, bpf program has no way to ensure + * known initial values for cpus other than current one + * (onallcpus=false always when coming from bpf prog). + */ + if (htab_is_prealloc(htab) && !onallcpus) { + u32 size = round_up(htab->map.value_size, 8); + int current_cpu = raw_smp_processor_id(); + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu == current_cpu) + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value, + size); + else + memset(per_cpu_ptr(pptr, cpu), 0, size); + } + } else { + pcpu_copy_value(htab, pptr, value, onallcpus); + } +} + static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && @@ -891,7 +917,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - pcpu_copy_value(htab, pptr, value, onallcpus); + pcpu_init_value(htab, pptr, value, onallcpus); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); @@ -1183,7 +1209,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), value, onallcpus); } else { - pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), value, onallcpus); hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/map_init.c b/tools/testing/selftests/bpf/prog_tests/map_init.c new file mode 100644 index 000000000000..14a31109dd0e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/map_init.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Tessares SA */ + +#include +#include "test_map_init.skel.h" + +#define TEST_VALUE 0x1234 +#define FILL_VALUE 0xdeadbeef + +static int nr_cpus; +static int duration; + +typedef unsigned long long map_key_t; +typedef unsigned long long map_value_t; +typedef struct { + map_value_t v; /* padding */ +} __bpf_percpu_val_align pcpu_map_value_t; + + +static int map_populate(int map_fd, int num) +{ + pcpu_map_value_t value[nr_cpus]; + int i, err; + map_key_t key; + + for (i = 0; i < nr_cpus; i++) + bpf_percpu(value, i) = FILL_VALUE; + + for (key = 1; key <= num; key++) { + err = bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + return -1; + } + + return 0; +} + +static struct test_map_init *setup(enum bpf_map_type map_type, int map_sz, + int *map_fd, int populate) +{ + struct test_map_init *skel; + int err; + + skel = test_map_init__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return NULL; + + err = bpf_map__set_type(skel->maps.hashmap1, map_type); + if (!ASSERT_OK(err, "bpf_map__set_type")) + goto error; + + err = bpf_map__set_max_entries(skel->maps.hashmap1, map_sz); + if (!ASSERT_OK(err, "bpf_map__set_max_entries")) + goto error; + + err = test_map_init__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto error; + + *map_fd = bpf_map__fd(skel->maps.hashmap1); + if (CHECK(*map_fd < 0, "bpf_map__fd", "failed\n")) + goto error; + + err = map_populate(*map_fd, populate); + if (!ASSERT_OK(err, "map_populate")) + goto error_map; + + return skel; + +error_map: + close(*map_fd); +error: + test_map_init__destroy(skel); + return NULL; +} + +/* executes bpf program that updates map with key, value */ +static int prog_run_insert_elem(struct test_map_init *skel, map_key_t key, + map_value_t value) +{ + struct test_map_init__bss *bss; + + bss = skel->bss; + + bss->inKey = key; + bss->inValue = value; + bss->inPid = getpid(); + + if (!ASSERT_OK(test_map_init__attach(skel), "skel_attach")) + return -1; + + /* Let tracepoint trigger */ + syscall(__NR_getpgid); + + test_map_init__detach(skel); + + return 0; +} + +static int check_values_one_cpu(pcpu_map_value_t *value, map_value_t expected) +{ + int i, nzCnt = 0; + map_value_t val; + + for (i = 0; i < nr_cpus; i++) { + val = bpf_percpu(value, i); + if (val) { + if (CHECK(val != expected, "map value", + "unexpected for cpu %d: 0x%llx\n", i, val)) + return -1; + nzCnt++; + } + } + + if (CHECK(nzCnt != 1, "map value", "set for %d CPUs instead of 1!\n", + nzCnt)) + return -1; + + return 0; +} + +/* Add key=1 elem with values set for all CPUs + * Delete elem key=1 + * Run bpf prog that inserts new key=1 elem with value=0x1234 + * (bpf prog can only set value for current CPU) + * Lookup Key=1 and check value is as expected for all CPUs: + * value set by bpf prog for one CPU, 0 for all others + */ +static void test_pcpu_map_init(void) +{ + pcpu_map_value_t value[nr_cpus]; + struct test_map_init *skel; + int map_fd, err; + map_key_t key; + + /* max 1 elem in map so insertion is forced to reuse freed entry */ + skel = setup(BPF_MAP_TYPE_PERCPU_HASH, 1, &map_fd, 1); + if (!ASSERT_OK_PTR(skel, "prog_setup")) + return; + + /* delete element so the entry can be re-used*/ + key = 1; + err = bpf_map_delete_elem(map_fd, &key); + if (!ASSERT_OK(err, "bpf_map_delete_elem")) + goto cleanup; + + /* run bpf prog that inserts new elem, re-using the slot just freed */ + err = prog_run_insert_elem(skel, key, TEST_VALUE); + if (!ASSERT_OK(err, "prog_run_insert_elem")) + goto cleanup; + + /* check that key=1 was re-created by bpf prog */ + err = bpf_map_lookup_elem(map_fd, &key, value); + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + goto cleanup; + + /* and has expected values */ + check_values_one_cpu(value, TEST_VALUE); + +cleanup: + test_map_init__destroy(skel); +} + +/* Add key=1 and key=2 elems with values set for all CPUs + * Run bpf prog that inserts new key=3 elem + * (only for current cpu; other cpus should have initial value = 0) + * Lookup Key=1 and check value is as expected for all CPUs + */ +static void test_pcpu_lru_map_init(void) +{ + pcpu_map_value_t value[nr_cpus]; + struct test_map_init *skel; + int map_fd, err; + map_key_t key; + + /* Set up LRU map with 2 elements, values filled for all CPUs. + * With these 2 elements, the LRU map is full + */ + skel = setup(BPF_MAP_TYPE_LRU_PERCPU_HASH, 2, &map_fd, 2); + if (!ASSERT_OK_PTR(skel, "prog_setup")) + return; + + /* run bpf prog that inserts new key=3 element, re-using LRU slot */ + key = 3; + err = prog_run_insert_elem(skel, key, TEST_VALUE); + if (!ASSERT_OK(err, "prog_run_insert_elem")) + goto cleanup; + + /* check that key=3 replaced one of earlier elements */ + err = bpf_map_lookup_elem(map_fd, &key, value); + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + goto cleanup; + + /* and has expected values */ + check_values_one_cpu(value, TEST_VALUE); + +cleanup: + test_map_init__destroy(skel); +} + +void test_map_init(void) +{ + nr_cpus = bpf_num_possible_cpus(); + if (nr_cpus <= 1) { + printf("%s:SKIP: >1 cpu needed for this test\n", __func__); + test__skip(); + return; + } + + if (test__start_subtest("pcpu_map_init")) + test_pcpu_map_init(); + if (test__start_subtest("pcpu_lru_map_init")) + test_pcpu_lru_map_init(); +} diff --git a/tools/testing/selftests/bpf/progs/test_map_init.c b/tools/testing/selftests/bpf/progs/test_map_init.c new file mode 100644 index 000000000000..c89d28ead673 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_map_init.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Tessares SA */ + +#include "vmlinux.h" +#include + +__u64 inKey = 0; +__u64 inValue = 0; +__u32 inPid = 0; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 2); + __type(key, __u64); + __type(value, __u64); +} hashmap1 SEC(".maps"); + + +SEC("tp/syscalls/sys_enter_getpgid") +int sysenter_getpgid(const void *ctx) +{ + /* Just do it for once, when called from our own test prog. This + * ensures the map value is only updated for a single CPU. + */ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + + if (cur_pid == inPid) + bpf_map_update_elem(&hashmap1, &inKey, &inValue, BPF_NOEXIST); + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From bcacf5f6f239a9e60287680514f392748cb4ec39 Mon Sep 17 00:00:00 2001 From: Liu Shaohua Date: Mon, 26 Oct 2020 20:26:54 +0800 Subject: riscv: fix pfn_to_virt err in do_page_fault(). The argument to pfn_to_virt() should be pfn not the value of CSR_SATP. Reviewed-by: Palmer Dabbelt Reviewed-by: Anup Patel Signed-off-by: liush Reviewed-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 1359e21c0c62..3c8b9e433c67 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -86,6 +86,7 @@ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long a pmd_t *pmd, *pmd_k; pte_t *pte_k; int index; + unsigned long pfn; /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) @@ -100,7 +101,8 @@ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long a * of a task switch. */ index = pgd_index(addr); - pgd = (pgd_t *)pfn_to_virt(csr_read(CSR_SATP)) + index; + pfn = csr_read(CSR_SATP) & SATP_PPN; + pgd = (pgd_t *)pfn_to_virt(pfn) + index; pgd_k = init_mm.pgd + index; if (!pgd_present(*pgd_k)) { -- cgit v1.2.3 From 635e3f3e47f24b2506bc9daf91d70ddf3cd024a9 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Mon, 2 Nov 2020 15:30:52 +0800 Subject: riscv: uaccess: fix __put_kernel_nofault() The copy_from_kernel_nofault() is broken on riscv because the 'dst' and 'src' are mistakenly reversed in __put_kernel_nofault() macro. copy_to_kernel_nofault: ... 0xffffffe0003159b8 <+30>: sd a4,0(a1) # a1 aka 'src' Fixes: d464118cdc ("riscv: implement __get_kernel_nofault and __put_user_nofault") Signed-off-by: Changbin Du Reviewed-by: Christoph Hellwig Reviewed-by: Anup Patel Tested-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index c47e6b35c551..824b2c9da75b 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -476,7 +476,7 @@ do { \ do { \ long __kr_err; \ \ - __put_user_nocheck(*((type *)(dst)), (type *)(src), __kr_err); \ + __put_user_nocheck(*((type *)(src)), (type *)(dst), __kr_err); \ if (unlikely(__kr_err)) \ goto err_label; \ } while (0) -- cgit v1.2.3 From 1074dd44c5ba377f90e2d0d99a784f73dbea6ff7 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 4 Nov 2020 12:07:13 +0530 Subject: RISC-V: Use non-PGD mappings for early DTB access Currently, we use PGD mappings for early DTB mapping in early_pgd but this breaks Linux kernel on SiFive Unleashed because on SiFive Unleashed PMP checks don't work correctly for PGD mappings. To fix early DTB mappings on SiFive Unleashed, we use non-PGD mappings (i.e. PMD) for early DTB access. Fixes: 8f3a2b4a96dc ("RISC-V: Move DT mapping outof fixmap") Signed-off-by: Anup Patel Reviewed-by: Atish Patra Tested-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 42c14eb6bc43..8e577f14f120 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -293,6 +293,7 @@ pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss; #define NUM_EARLY_PMDS (1UL + MAX_EARLY_MAPPING_SIZE / PGDIR_SIZE) #endif pmd_t early_pmd[PTRS_PER_PMD * NUM_EARLY_PMDS] __initdata __aligned(PAGE_SIZE); +pmd_t early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) { @@ -490,6 +491,18 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) load_pa + (va - PAGE_OFFSET), map_size, PAGE_KERNEL_EXEC); +#ifndef __PAGETABLE_PMD_FOLDED + /* Setup early PMD for DTB */ + create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, + (uintptr_t)early_dtb_pmd, PGDIR_SIZE, PAGE_TABLE); + /* Create two consecutive PMD mappings for FDT early scan */ + pa = dtb_pa & ~(PMD_SIZE - 1); + create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, + pa, PMD_SIZE, PAGE_KERNEL); + create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA + PMD_SIZE, + pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL); + dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1)); +#else /* Create two consecutive PGD mappings for FDT early scan */ pa = dtb_pa & ~(PGDIR_SIZE - 1); create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, @@ -497,6 +510,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA + PGDIR_SIZE, pa + PGDIR_SIZE, PGDIR_SIZE, PAGE_KERNEL); dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1)); +#endif dtb_early_pa = dtb_pa; /* -- cgit v1.2.3 From c2c81bb2f69138f902e1a58d3bef6ad97fb8a92c Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 23 Oct 2020 21:50:47 -0700 Subject: RISC-V: Fix the VDSO symbol generaton for binutils-2.35+ We were relying on GNU ld's ability to re-link executable files in order to extract our VDSO symbols. This behavior was deemed a bug as of binutils-2.35 (specifically the binutils-gdb commit a87e1817a4 ("Have the linker fail if any attempt to link in an executable is made."), but as that has been backported to at least Debian's binutils-2.34 in may manifest in other places. The previous version of this was a bit of a mess: we were linking a static executable version of the VDSO, containing only a subset of the input symbols, which we then linked into the kernel. This worked, but certainly wasn't a supported path through the toolchain. Instead this new version parses the textual output of nm to produce a symbol table. Both rely on near-zero addresses being linkable, but as we rely on weak undefined symbols being linkable elsewhere I don't view this as a major issue. Fixes: e2c0cdfba7f6 ("RISC-V: User-facing API") Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vdso/.gitignore | 1 + arch/riscv/kernel/vdso/Makefile | 18 +++++++++--------- arch/riscv/kernel/vdso/so2s.sh | 6 ++++++ 3 files changed, 16 insertions(+), 9 deletions(-) create mode 100755 arch/riscv/kernel/vdso/so2s.sh diff --git a/arch/riscv/kernel/vdso/.gitignore b/arch/riscv/kernel/vdso/.gitignore index 11ebee9e4c1d..3a19def868ec 100644 --- a/arch/riscv/kernel/vdso/.gitignore +++ b/arch/riscv/kernel/vdso/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only vdso.lds *.tmp +vdso-syms.S diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 7d6a94d45ec9..cb8f9e4cfcbf 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -43,19 +43,14 @@ $(obj)/vdso.o: $(obj)/vdso.so SYSCFLAGS_vdso.so.dbg = $(c_flags) $(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold) +SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \ + -Wl,--build-id -Wl,--hash-style=both # We also create a special relocatable object that should mirror the symbol # table and layout of the linked DSO. With ld --just-symbols we can then # refer to these symbols in the kernel code rather than hand-coded addresses. - -SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \ - -Wl,--build-id=sha1 -Wl,--hash-style=both -$(obj)/vdso-dummy.o: $(src)/vdso.lds $(obj)/rt_sigreturn.o FORCE - $(call if_changed,vdsold) - -LDFLAGS_vdso-syms.o := -r --just-symbols -$(obj)/vdso-syms.o: $(obj)/vdso-dummy.o FORCE - $(call if_changed,ld) +$(obj)/vdso-syms.S: $(obj)/vdso.so FORCE + $(call if_changed,so2s) # strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S @@ -73,6 +68,11 @@ quiet_cmd_vdsold = VDSOLD $@ $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \ rm $@.tmp +# Extracts symbol offsets from the VDSO, converting them into an assembly file +# that contains the same symbols at the same offsets. +quiet_cmd_so2s = SO2S $@ + cmd_so2s = $(NM) -D $< | $(srctree)/$(src)/so2s.sh > $@ + # install commands for the unstripped file quiet_cmd_vdso_install = INSTALL $@ cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ diff --git a/arch/riscv/kernel/vdso/so2s.sh b/arch/riscv/kernel/vdso/so2s.sh new file mode 100755 index 000000000000..e64cb6d9440e --- /dev/null +++ b/arch/riscv/kernel/vdso/so2s.sh @@ -0,0 +1,6 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# Copyright 2020 Palmer Dabbelt + +sed 's!\([0-9a-f]*\) T \([a-z0-9_]*\)\(@@LINUX_4.15\)*!.global \2\n.set \2,0x\1!' \ +| grep '^\.' -- cgit v1.2.3 From 20914919ad31849ee2b9cfe0428f4a20335c9e2a Mon Sep 17 00:00:00 2001 From: Macpaul Lin Date: Fri, 6 Nov 2020 13:54:29 +0800 Subject: usb: mtu3: fix panic in mtu3_gadget_stop() This patch fixes a possible issue when mtu3_gadget_stop() already assigned NULL to mtu->gadget_driver during mtu_gadget_disconnect(). [] notifier_call_chain+0xa4/0x128 [] __atomic_notifier_call_chain+0x84/0x138 [] notify_die+0xb0/0x120 [] die+0x1f8/0x5d0 [] __do_kernel_fault+0x19c/0x280 [] do_bad_area+0x44/0x140 [] do_translation_fault+0x4c/0x90 [] do_mem_abort+0xb8/0x258 [] el1_da+0x24/0x3c [] mtu3_gadget_disconnect+0xac/0x128 [] mtu3_irq+0x34c/0xc18 [] __handle_irq_event_percpu+0x2ac/0xcd0 [] handle_irq_event_percpu+0x80/0x138 [] handle_irq_event+0xac/0x148 [] handle_fasteoi_irq+0x234/0x568 [] generic_handle_irq+0x48/0x68 [] __handle_domain_irq+0x264/0x1740 [] gic_handle_irq+0x14c/0x250 [] el1_irq+0xec/0x194 [] dma_pool_alloc+0x6e4/0xae0 [] cmdq_mbox_pool_alloc_impl+0xb0/0x238 [] cmdq_pkt_alloc_buf+0x2dc/0x7c0 [] cmdq_pkt_add_cmd_buffer+0x178/0x270 [] cmdq_pkt_perf_begin+0x108/0x148 [] cmdq_pkt_create+0x178/0x1f0 [] mtk_crtc_config_default_path+0x328/0x7a0 [] mtk_drm_idlemgr_kick+0xa6c/0x1460 [] mtk_drm_crtc_atomic_begin+0x1a4/0x1a68 [] drm_atomic_helper_commit_planes+0x154/0x878 [] mtk_atomic_complete.isra.16+0xe80/0x19c8 [] mtk_atomic_commit+0x258/0x898 [] drm_atomic_commit+0xcc/0x108 [] drm_mode_atomic_ioctl+0x1c20/0x2580 [] drm_ioctl_kernel+0x118/0x1b0 [] drm_ioctl+0x5c0/0x920 [] do_vfs_ioctl+0x188/0x1820 [] SyS_ioctl+0x8c/0xa0 Fixes: df2069acb005 ("usb: Add MediaTek USB3 DRD driver") Signed-off-by: Macpaul Lin Acked-by: Chunfeng Yun Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1604642069-20961-1-git-send-email-macpaul.lin@mediatek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/mtu3/mtu3_gadget.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/mtu3/mtu3_gadget.c b/drivers/usb/mtu3/mtu3_gadget.c index 1de5c9a1d20a..38f17d66d5bc 100644 --- a/drivers/usb/mtu3/mtu3_gadget.c +++ b/drivers/usb/mtu3/mtu3_gadget.c @@ -564,6 +564,7 @@ static int mtu3_gadget_stop(struct usb_gadget *g) spin_unlock_irqrestore(&mtu->lock, flags); + synchronize_irq(mtu->irq); return 0; } -- cgit v1.2.3 From 00bd6bca3fb1e98190a24eda2583062803c9e8b5 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Mon, 2 Nov 2020 10:26:50 +0800 Subject: USB: apple-mfi-fastcharge: fix reference leak in apple_mfi_fc_set_property pm_runtime_get_sync() will increment pm usage at first and it will resume the device later. If runtime of the device has error or device is in inaccessible state(or other error state), resume operation will fail. If we do not call put operation to decrease the reference, the result is that this device cannot enter the idle state and always stay busy or other non-idle state. Fixes: 249fa8217b846 ("USB: Add driver to control USB fast charge for iOS devices") Signed-off-by: Zhang Qilong Link: https://lore.kernel.org/r/20201102022650.67115-1-zhangqilong3@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/apple-mfi-fastcharge.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/misc/apple-mfi-fastcharge.c b/drivers/usb/misc/apple-mfi-fastcharge.c index 579d8c84de42..9de0171b5177 100644 --- a/drivers/usb/misc/apple-mfi-fastcharge.c +++ b/drivers/usb/misc/apple-mfi-fastcharge.c @@ -120,8 +120,10 @@ static int apple_mfi_fc_set_property(struct power_supply *psy, dev_dbg(&mfi->udev->dev, "prop: %d\n", psp); ret = pm_runtime_get_sync(&mfi->udev->dev); - if (ret < 0) + if (ret < 0) { + pm_runtime_put_noidle(&mfi->udev->dev); return ret; + } switch (psp) { case POWER_SUPPLY_PROP_CHARGE_TYPE: -- cgit v1.2.3 From a49cc1fe9d64a2dc4e19b599204f403e5d25f44b Mon Sep 17 00:00:00 2001 From: Ulrich Hecht Date: Mon, 28 Sep 2020 17:59:50 +0200 Subject: i2c: sh_mobile: implement atomic transfers Implements atomic transfers to fix reboot/shutdown on r8a7790 Lager and similar boards. Signed-off-by: Ulrich Hecht Tested-by: Wolfram Sang Tested-by: Geert Uytterhoeven [wsa: some whitespace fixing] Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-sh_mobile.c | 86 +++++++++++++++++++++++++++++--------- 1 file changed, 66 insertions(+), 20 deletions(-) diff --git a/drivers/i2c/busses/i2c-sh_mobile.c b/drivers/i2c/busses/i2c-sh_mobile.c index cab725559999..bdd60770779a 100644 --- a/drivers/i2c/busses/i2c-sh_mobile.c +++ b/drivers/i2c/busses/i2c-sh_mobile.c @@ -129,6 +129,7 @@ struct sh_mobile_i2c_data { int sr; bool send_stop; bool stop_after_dma; + bool atomic_xfer; struct resource *res; struct dma_chan *dma_tx; @@ -330,13 +331,15 @@ static unsigned char i2c_op(struct sh_mobile_i2c_data *pd, enum sh_mobile_i2c_op ret = iic_rd(pd, ICDR); break; case OP_RX_STOP: /* enable DTE interrupt, issue stop */ - iic_wr(pd, ICIC, - ICIC_DTEE | ICIC_WAITE | ICIC_ALE | ICIC_TACKE); + if (!pd->atomic_xfer) + iic_wr(pd, ICIC, + ICIC_DTEE | ICIC_WAITE | ICIC_ALE | ICIC_TACKE); iic_wr(pd, ICCR, ICCR_ICE | ICCR_RACK); break; case OP_RX_STOP_DATA: /* enable DTE interrupt, read data, issue stop */ - iic_wr(pd, ICIC, - ICIC_DTEE | ICIC_WAITE | ICIC_ALE | ICIC_TACKE); + if (!pd->atomic_xfer) + iic_wr(pd, ICIC, + ICIC_DTEE | ICIC_WAITE | ICIC_ALE | ICIC_TACKE); ret = iic_rd(pd, ICDR); iic_wr(pd, ICCR, ICCR_ICE | ICCR_RACK); break; @@ -429,7 +432,8 @@ static irqreturn_t sh_mobile_i2c_isr(int irq, void *dev_id) if (wakeup) { pd->sr |= SW_DONE; - wake_up(&pd->wait); + if (!pd->atomic_xfer) + wake_up(&pd->wait); } /* defeat write posting to avoid spurious WAIT interrupts */ @@ -581,6 +585,9 @@ static void start_ch(struct sh_mobile_i2c_data *pd, struct i2c_msg *usr_msg, pd->pos = -1; pd->sr = 0; + if (pd->atomic_xfer) + return; + pd->dma_buf = i2c_get_dma_safe_msg_buf(pd->msg, 8); if (pd->dma_buf) sh_mobile_i2c_xfer_dma(pd); @@ -637,15 +644,13 @@ static int poll_busy(struct sh_mobile_i2c_data *pd) return i ? 0 : -ETIMEDOUT; } -static int sh_mobile_i2c_xfer(struct i2c_adapter *adapter, - struct i2c_msg *msgs, - int num) +static int sh_mobile_xfer(struct sh_mobile_i2c_data *pd, + struct i2c_msg *msgs, int num) { - struct sh_mobile_i2c_data *pd = i2c_get_adapdata(adapter); struct i2c_msg *msg; int err = 0; int i; - long timeout; + long time_left; /* Wake up device and enable clock */ pm_runtime_get_sync(pd->dev); @@ -662,15 +667,35 @@ static int sh_mobile_i2c_xfer(struct i2c_adapter *adapter, if (do_start) i2c_op(pd, OP_START); - /* The interrupt handler takes care of the rest... */ - timeout = wait_event_timeout(pd->wait, - pd->sr & (ICSR_TACK | SW_DONE), - adapter->timeout); - - /* 'stop_after_dma' tells if DMA transfer was complete */ - i2c_put_dma_safe_msg_buf(pd->dma_buf, pd->msg, pd->stop_after_dma); + if (pd->atomic_xfer) { + unsigned long j = jiffies + pd->adap.timeout; + + time_left = time_before_eq(jiffies, j); + while (time_left && + !(pd->sr & (ICSR_TACK | SW_DONE))) { + unsigned char sr = iic_rd(pd, ICSR); + + if (sr & (ICSR_AL | ICSR_TACK | + ICSR_WAIT | ICSR_DTE)) { + sh_mobile_i2c_isr(0, pd); + udelay(150); + } else { + cpu_relax(); + } + time_left = time_before_eq(jiffies, j); + } + } else { + /* The interrupt handler takes care of the rest... */ + time_left = wait_event_timeout(pd->wait, + pd->sr & (ICSR_TACK | SW_DONE), + pd->adap.timeout); + + /* 'stop_after_dma' tells if DMA xfer was complete */ + i2c_put_dma_safe_msg_buf(pd->dma_buf, pd->msg, + pd->stop_after_dma); + } - if (!timeout) { + if (!time_left) { dev_err(pd->dev, "Transfer request timed out\n"); if (pd->dma_direction != DMA_NONE) sh_mobile_i2c_cleanup_dma(pd); @@ -696,14 +721,35 @@ static int sh_mobile_i2c_xfer(struct i2c_adapter *adapter, return err ?: num; } +static int sh_mobile_i2c_xfer(struct i2c_adapter *adapter, + struct i2c_msg *msgs, + int num) +{ + struct sh_mobile_i2c_data *pd = i2c_get_adapdata(adapter); + + pd->atomic_xfer = false; + return sh_mobile_xfer(pd, msgs, num); +} + +static int sh_mobile_i2c_xfer_atomic(struct i2c_adapter *adapter, + struct i2c_msg *msgs, + int num) +{ + struct sh_mobile_i2c_data *pd = i2c_get_adapdata(adapter); + + pd->atomic_xfer = true; + return sh_mobile_xfer(pd, msgs, num); +} + static u32 sh_mobile_i2c_func(struct i2c_adapter *adapter) { return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL | I2C_FUNC_PROTOCOL_MANGLING; } static const struct i2c_algorithm sh_mobile_i2c_algorithm = { - .functionality = sh_mobile_i2c_func, - .master_xfer = sh_mobile_i2c_xfer, + .functionality = sh_mobile_i2c_func, + .master_xfer = sh_mobile_i2c_xfer, + .master_xfer_atomic = sh_mobile_i2c_xfer_atomic, }; static const struct i2c_adapter_quirks sh_mobile_i2c_quirks = { -- cgit v1.2.3 From ed01ddc618fc356bbc7b702823c87ed3ada198a6 Mon Sep 17 00:00:00 2001 From: Khalil Blaiech Date: Tue, 3 Nov 2020 14:54:38 -0500 Subject: i2c: mlxbf: Add CONFIG_ACPI to guard ACPI function call The build fails with "implicit declaration of function 'acpi_device_uid'" error. Thus, protect ACPI function calls from being called when CONFIG_ACPI is disabled. Fixes: b5b5b32081cd206b ("i2c: mlxbf: I2C SMBus driver for Mellanox BlueField SoC") Reported-by: kernel test robot Reviewed-by: Leon Romanovsky Reviewed-by: Vadim Pasternak Signed-off-by: Khalil Blaiech Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-mlxbf.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/i2c/busses/i2c-mlxbf.c b/drivers/i2c/busses/i2c-mlxbf.c index ee59e0da082d..e18f595b37a7 100644 --- a/drivers/i2c/busses/i2c-mlxbf.c +++ b/drivers/i2c/busses/i2c-mlxbf.c @@ -2264,6 +2264,7 @@ static const struct of_device_id mlxbf_i2c_dt_ids[] = { MODULE_DEVICE_TABLE(of, mlxbf_i2c_dt_ids); +#ifdef CONFIG_ACPI static const struct acpi_device_id mlxbf_i2c_acpi_ids[] = { { "MLNXBF03", (kernel_ulong_t)&mlxbf_i2c_chip[MLXBF_I2C_CHIP_TYPE_1] }, { "MLNXBF23", (kernel_ulong_t)&mlxbf_i2c_chip[MLXBF_I2C_CHIP_TYPE_2] }, @@ -2305,6 +2306,12 @@ static int mlxbf_i2c_acpi_probe(struct device *dev, struct mlxbf_i2c_priv *priv) return ret; } +#else +static int mlxbf_i2c_acpi_probe(struct device *dev, struct mlxbf_i2c_priv *priv) +{ + return -ENOENT; +} +#endif /* CONFIG_ACPI */ static int mlxbf_i2c_of_probe(struct device *dev, struct mlxbf_i2c_priv *priv) { @@ -2473,7 +2480,9 @@ static struct platform_driver mlxbf_i2c_driver = { .driver = { .name = "i2c-mlxbf", .of_match_table = mlxbf_i2c_dt_ids, +#ifdef CONFIG_ACPI .acpi_match_table = ACPI_PTR(mlxbf_i2c_acpi_ids), +#endif /* CONFIG_ACPI */ }, }; -- cgit v1.2.3 From 08e019e27a9ed0d6c410176cab4e029e3d233cb8 Mon Sep 17 00:00:00 2001 From: Khalil Blaiech Date: Tue, 3 Nov 2020 14:54:39 -0500 Subject: i2c: mlxbf: Fix resrticted cast warning of sparse Address warnings "warning: cast to restricted __be32" reported by sparse. Fixes: b5b5b32081cd206b ("i2c: mlxbf: I2C SMBus driver for Mellanox BlueField SoC") Reported-by: kernel test robot Reviewed-by: Leon Romanovsky Reviewed-by: Vadim Pasternak Signed-off-by: Khalil Blaiech Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-mlxbf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-mlxbf.c b/drivers/i2c/busses/i2c-mlxbf.c index e18f595b37a7..4bd80b1b6a29 100644 --- a/drivers/i2c/busses/i2c-mlxbf.c +++ b/drivers/i2c/busses/i2c-mlxbf.c @@ -510,7 +510,7 @@ static u32 mlxbf_i2c_read(void __iomem *io, int reg) */ static u32 mlxbf_i2c_read_data(void __iomem *io, int reg) { - return (u32)be32_to_cpu(mlxbf_i2c_read(io, reg)); + return ioread32be(io + reg); } /* @@ -524,7 +524,7 @@ static u32 mlxbf_i2c_read_data(void __iomem *io, int reg) */ static void mlxbf_i2c_write_data(void __iomem *io, int reg, u32 val) { - mlxbf_i2c_write(io, reg, (u32)cpu_to_be32(val)); + iowrite32be(val, io + reg); } /* -- cgit v1.2.3 From 4b19d806ac5272bb9f64d78ca6832867870eb45d Mon Sep 17 00:00:00 2001 From: Khalil Blaiech Date: Tue, 3 Nov 2020 14:54:40 -0500 Subject: i2c: mlxbf: Remove unecessary wrapper functions Few wrapper functions are useless and can be inlined. So delete mlxbf_i2c_read() and mlxbf_i2c_write() and replace them with readl() and writel(), respectively. Also delete mlxbf_i2c_read_data() and mlxbf_i2c_write() and replace them with ioread32be() and iowrite32be(), respectively. Fixes: b5b5b32081cd206b ("i2c: mlxbf: I2C SMBus driver for Mellanox BlueField SoC") Reviewed-by: Leon Romanovsky Reviewed-by: Vadim Pasternak Signed-off-by: Khalil Blaiech Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-mlxbf.c | 183 ++++++++++++++++------------------------- 1 file changed, 72 insertions(+), 111 deletions(-) diff --git a/drivers/i2c/busses/i2c-mlxbf.c b/drivers/i2c/busses/i2c-mlxbf.c index 4bd80b1b6a29..fca8a3bddcb1 100644 --- a/drivers/i2c/busses/i2c-mlxbf.c +++ b/drivers/i2c/busses/i2c-mlxbf.c @@ -489,44 +489,6 @@ static struct mutex mlxbf_i2c_bus_lock; #define MLXBF_I2C_FREQUENCY_1GHZ 1000000000 -static void mlxbf_i2c_write(void __iomem *io, int reg, u32 val) -{ - writel(val, io + reg); -} - -static u32 mlxbf_i2c_read(void __iomem *io, int reg) -{ - return readl(io + reg); -} - -/* - * This function is used to read data from Master GW Data Descriptor. - * Data bytes in the Master GW Data Descriptor are shifted left so the - * data starts at the MSB of the descriptor registers as set by the - * underlying hardware. TYU_READ_DATA enables byte swapping while - * reading data bytes, and MUST be called by the SMBus read routines - * to copy data from the 32 * 32-bit HW Data registers a.k.a Master GW - * Data Descriptor. - */ -static u32 mlxbf_i2c_read_data(void __iomem *io, int reg) -{ - return ioread32be(io + reg); -} - -/* - * This function is used to write data to the Master GW Data Descriptor. - * Data copied to the Master GW Data Descriptor MUST be shifted left so - * the data starts at the MSB of the descriptor registers as required by - * the underlying hardware. TYU_WRITE_DATA enables byte swapping when - * writing data bytes, and MUST be called by the SMBus write routines to - * copy data to the 32 * 32-bit HW Data registers a.k.a Master GW Data - * Descriptor. - */ -static void mlxbf_i2c_write_data(void __iomem *io, int reg, u32 val) -{ - iowrite32be(val, io + reg); -} - /* * Function to poll a set of bits at a specific address; it checks whether * the bits are equal to zero when eq_zero is set to 'true', and not equal @@ -541,7 +503,7 @@ static u32 mlxbf_smbus_poll(void __iomem *io, u32 addr, u32 mask, timeout = (timeout / MLXBF_I2C_POLL_FREQ_IN_USEC) + 1; do { - bits = mlxbf_i2c_read(io, addr) & mask; + bits = readl(io + addr) & mask; if (eq_zero ? bits == 0 : bits != 0) return eq_zero ? 1 : bits; udelay(MLXBF_I2C_POLL_FREQ_IN_USEC); @@ -609,16 +571,16 @@ static int mlxbf_i2c_smbus_check_status(struct mlxbf_i2c_priv *priv) MLXBF_I2C_SMBUS_TIMEOUT); /* Read cause status bits. */ - cause_status_bits = mlxbf_i2c_read(priv->mst_cause->io, - MLXBF_I2C_CAUSE_ARBITER); + cause_status_bits = readl(priv->mst_cause->io + + MLXBF_I2C_CAUSE_ARBITER); cause_status_bits &= MLXBF_I2C_CAUSE_MASTER_ARBITER_BITS_MASK; /* * Parse both Cause and Master GW bits, then return transaction status. */ - master_status_bits = mlxbf_i2c_read(priv->smbus->io, - MLXBF_I2C_SMBUS_MASTER_STATUS); + master_status_bits = readl(priv->smbus->io + + MLXBF_I2C_SMBUS_MASTER_STATUS); master_status_bits &= MLXBF_I2C_SMBUS_MASTER_STATUS_MASK; if (mlxbf_i2c_smbus_transaction_success(master_status_bits, @@ -649,10 +611,17 @@ static void mlxbf_i2c_smbus_write_data(struct mlxbf_i2c_priv *priv, aligned_length = round_up(length, 4); - /* Copy data bytes from 4-byte aligned source buffer. */ + /* + * Copy data bytes from 4-byte aligned source buffer. + * Data copied to the Master GW Data Descriptor MUST be shifted + * left so the data starts at the MSB of the descriptor registers + * as required by the underlying hardware. Enable byte swapping + * when writing data bytes to the 32 * 32-bit HW Data registers + * a.k.a Master GW Data Descriptor. + */ for (offset = 0; offset < aligned_length; offset += sizeof(u32)) { data32 = *((u32 *)(data + offset)); - mlxbf_i2c_write_data(priv->smbus->io, addr + offset, data32); + iowrite32be(data32, priv->smbus->io + addr + offset); } } @@ -664,15 +633,23 @@ static void mlxbf_i2c_smbus_read_data(struct mlxbf_i2c_priv *priv, mask = sizeof(u32) - 1; + /* + * Data bytes in the Master GW Data Descriptor are shifted left + * so the data starts at the MSB of the descriptor registers as + * set by the underlying hardware. Enable byte swapping while + * reading data bytes from the 32 * 32-bit HW Data registers + * a.k.a Master GW Data Descriptor. + */ + for (offset = 0; offset < (length & ~mask); offset += sizeof(u32)) { - data32 = mlxbf_i2c_read_data(priv->smbus->io, addr + offset); + data32 = ioread32be(priv->smbus->io + addr + offset); *((u32 *)(data + offset)) = data32; } if (!(length & mask)) return; - data32 = mlxbf_i2c_read_data(priv->smbus->io, addr + offset); + data32 = ioread32be(priv->smbus->io + addr + offset); for (byte = 0; byte < (length & mask); byte++) { data[offset + byte] = data32 & GENMASK(7, 0); @@ -698,16 +675,16 @@ static int mlxbf_i2c_smbus_enable(struct mlxbf_i2c_priv *priv, u8 slave, command |= rol32(pec_en, MLXBF_I2C_MASTER_SEND_PEC_SHIFT); /* Clear status bits. */ - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_MASTER_STATUS, 0x0); + writel(0x0, priv->smbus->io + MLXBF_I2C_SMBUS_MASTER_STATUS); /* Set the cause data. */ - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_CAUSE_OR_CLEAR, ~0x0); + writel(~0x0, priv->smbus->io + MLXBF_I2C_CAUSE_OR_CLEAR); /* Zero PEC byte. */ - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_MASTER_PEC, 0x0); + writel(0x0, priv->smbus->io + MLXBF_I2C_SMBUS_MASTER_PEC); /* Zero byte count. */ - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_RS_BYTES, 0x0); + writel(0x0, priv->smbus->io + MLXBF_I2C_SMBUS_RS_BYTES); /* GW activation. */ - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_MASTER_GW, command); + writel(command, priv->smbus->io + MLXBF_I2C_SMBUS_MASTER_GW); /* * Poll master status and check status bits. An ACK is sent when @@ -823,8 +800,8 @@ mlxbf_i2c_smbus_start_transaction(struct mlxbf_i2c_priv *priv, * needs to be 'manually' reset. This should be removed in * next tag integration. */ - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_MASTER_FSM, - MLXBF_I2C_SMBUS_MASTER_FSM_PS_STATE_MASK); + writel(MLXBF_I2C_SMBUS_MASTER_FSM_PS_STATE_MASK, + priv->smbus->io + MLXBF_I2C_SMBUS_MASTER_FSM); } return ret; @@ -1113,8 +1090,8 @@ static void mlxbf_i2c_set_timings(struct mlxbf_i2c_priv *priv, timer |= mlxbf_i2c_set_timer(priv, timings->scl_low, false, MLXBF_I2C_MASK_16, MLXBF_I2C_SHIFT_16); - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_TIMER_SCL_LOW_SCL_HIGH, - timer); + writel(timer, priv->smbus->io + + MLXBF_I2C_SMBUS_TIMER_SCL_LOW_SCL_HIGH); timer = mlxbf_i2c_set_timer(priv, timings->sda_rise, false, MLXBF_I2C_MASK_8, MLXBF_I2C_SHIFT_0); @@ -1124,37 +1101,34 @@ static void mlxbf_i2c_set_timings(struct mlxbf_i2c_priv *priv, MLXBF_I2C_MASK_8, MLXBF_I2C_SHIFT_16); timer |= mlxbf_i2c_set_timer(priv, timings->scl_fall, false, MLXBF_I2C_MASK_8, MLXBF_I2C_SHIFT_24); - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_TIMER_FALL_RISE_SPIKE, - timer); + writel(timer, priv->smbus->io + + MLXBF_I2C_SMBUS_TIMER_FALL_RISE_SPIKE); timer = mlxbf_i2c_set_timer(priv, timings->hold_start, true, MLXBF_I2C_MASK_16, MLXBF_I2C_SHIFT_0); timer |= mlxbf_i2c_set_timer(priv, timings->hold_data, true, MLXBF_I2C_MASK_16, MLXBF_I2C_SHIFT_16); - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_TIMER_THOLD, timer); + writel(timer, priv->smbus->io + MLXBF_I2C_SMBUS_TIMER_THOLD); timer = mlxbf_i2c_set_timer(priv, timings->setup_start, true, MLXBF_I2C_MASK_16, MLXBF_I2C_SHIFT_0); timer |= mlxbf_i2c_set_timer(priv, timings->setup_stop, true, MLXBF_I2C_MASK_16, MLXBF_I2C_SHIFT_16); - mlxbf_i2c_write(priv->smbus->io, - MLXBF_I2C_SMBUS_TIMER_TSETUP_START_STOP, timer); + writel(timer, priv->smbus->io + + MLXBF_I2C_SMBUS_TIMER_TSETUP_START_STOP); timer = mlxbf_i2c_set_timer(priv, timings->setup_data, true, MLXBF_I2C_MASK_16, MLXBF_I2C_SHIFT_0); - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_TIMER_TSETUP_DATA, - timer); + writel(timer, priv->smbus->io + MLXBF_I2C_SMBUS_TIMER_TSETUP_DATA); timer = mlxbf_i2c_set_timer(priv, timings->buf, false, MLXBF_I2C_MASK_16, MLXBF_I2C_SHIFT_0); timer |= mlxbf_i2c_set_timer(priv, timings->thigh_max, false, MLXBF_I2C_MASK_16, MLXBF_I2C_SHIFT_16); - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_THIGH_MAX_TBUF, - timer); + writel(timer, priv->smbus->io + MLXBF_I2C_SMBUS_THIGH_MAX_TBUF); timer = timings->timeout; - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_SCL_LOW_TIMEOUT, - timer); + writel(timer, priv->smbus->io + MLXBF_I2C_SMBUS_SCL_LOW_TIMEOUT); } enum mlxbf_i2c_timings_config { @@ -1426,19 +1400,15 @@ static int mlxbf_i2c_init_master(struct platform_device *pdev, * platform firmware; disabling the bus might compromise the system * functionality. */ - config_reg = mlxbf_i2c_read(gpio_res->io, - MLXBF_I2C_GPIO_0_FUNC_EN_0); + config_reg = readl(gpio_res->io + MLXBF_I2C_GPIO_0_FUNC_EN_0); config_reg = MLXBF_I2C_GPIO_SMBUS_GW_ASSERT_PINS(priv->bus, config_reg); - mlxbf_i2c_write(gpio_res->io, MLXBF_I2C_GPIO_0_FUNC_EN_0, - config_reg); + writel(config_reg, gpio_res->io + MLXBF_I2C_GPIO_0_FUNC_EN_0); - config_reg = mlxbf_i2c_read(gpio_res->io, - MLXBF_I2C_GPIO_0_FORCE_OE_EN); + config_reg = readl(gpio_res->io + MLXBF_I2C_GPIO_0_FORCE_OE_EN); config_reg = MLXBF_I2C_GPIO_SMBUS_GW_RESET_PINS(priv->bus, config_reg); - mlxbf_i2c_write(gpio_res->io, MLXBF_I2C_GPIO_0_FORCE_OE_EN, - config_reg); + writel(config_reg, gpio_res->io + MLXBF_I2C_GPIO_0_FORCE_OE_EN); mutex_unlock(gpio_res->lock); @@ -1454,8 +1424,7 @@ static u64 mlxbf_calculate_freq_from_tyu(struct mlxbf_i2c_resource *corepll_res) pad_frequency = MLXBF_I2C_TYU_PLL_IN_FREQ; - corepll_val = mlxbf_i2c_read(corepll_res->io, - MLXBF_I2C_CORE_PLL_REG1); + corepll_val = readl(corepll_res->io + MLXBF_I2C_CORE_PLL_REG1); /* Get Core PLL configuration bits. */ core_f = rol32(corepll_val, MLXBF_I2C_COREPLL_CORE_F_TYU_SHIFT) & @@ -1490,10 +1459,8 @@ static u64 mlxbf_calculate_freq_from_yu(struct mlxbf_i2c_resource *corepll_res) pad_frequency = MLXBF_I2C_YU_PLL_IN_FREQ; - corepll_reg1_val = mlxbf_i2c_read(corepll_res->io, - MLXBF_I2C_CORE_PLL_REG1); - corepll_reg2_val = mlxbf_i2c_read(corepll_res->io, - MLXBF_I2C_CORE_PLL_REG2); + corepll_reg1_val = readl(corepll_res->io + MLXBF_I2C_CORE_PLL_REG1); + corepll_reg2_val = readl(corepll_res->io + MLXBF_I2C_CORE_PLL_REG2); /* Get Core PLL configuration bits */ core_f = rol32(corepll_reg1_val, MLXBF_I2C_COREPLL_CORE_F_YU_SHIFT) & @@ -1585,7 +1552,7 @@ static int mlxbf_slave_enable(struct mlxbf_i2c_priv *priv, u8 addr) * (7-bit address, 1 status bit (1 if enabled, 0 if not)). */ for (reg = 0; reg < reg_cnt; reg++) { - slave_reg = mlxbf_i2c_read(priv->smbus->io, + slave_reg = readl(priv->smbus->io + MLXBF_I2C_SMBUS_SLAVE_ADDR_CFG + reg * 0x4); /* * Each register holds 4 slave addresses. So, we have to keep @@ -1643,8 +1610,8 @@ static int mlxbf_slave_enable(struct mlxbf_i2c_priv *priv, u8 addr) /* Enable the slave address and update the register. */ slave_reg |= (1 << MLXBF_I2C_SMBUS_SLAVE_ADDR_EN_BIT) << (byte * 8); - mlxbf_i2c_write(priv->smbus->io, - MLXBF_I2C_SMBUS_SLAVE_ADDR_CFG + reg * 0x4, slave_reg); + writel(slave_reg, priv->smbus->io + MLXBF_I2C_SMBUS_SLAVE_ADDR_CFG + + reg * 0x4); return 0; } @@ -1668,7 +1635,7 @@ static int mlxbf_slave_disable(struct mlxbf_i2c_priv *priv) * (7-bit address, 1 status bit (1 if enabled, 0 if not)). */ for (reg = 0; reg < reg_cnt; reg++) { - slave_reg = mlxbf_i2c_read(priv->smbus->io, + slave_reg = readl(priv->smbus->io + MLXBF_I2C_SMBUS_SLAVE_ADDR_CFG + reg * 0x4); /* Check whether the address slots are empty. */ @@ -1708,8 +1675,8 @@ static int mlxbf_slave_disable(struct mlxbf_i2c_priv *priv) /* Cleanup the slave address slot. */ slave_reg &= ~(GENMASK(7, 0) << (slave_byte * 8)); - mlxbf_i2c_write(priv->smbus->io, - MLXBF_I2C_SMBUS_SLAVE_ADDR_CFG + reg * 0x4, slave_reg); + writel(slave_reg, priv->smbus->io + MLXBF_I2C_SMBUS_SLAVE_ADDR_CFG + + reg * 0x4); return 0; } @@ -1801,7 +1768,7 @@ static int mlxbf_i2c_init_slave(struct platform_device *pdev, int ret; /* Reset FSM. */ - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_SLAVE_FSM, 0); + writel(0, priv->smbus->io + MLXBF_I2C_SMBUS_SLAVE_FSM); /* * Enable slave cause interrupt bits. Drive @@ -1810,15 +1777,13 @@ static int mlxbf_i2c_init_slave(struct platform_device *pdev, * masters issue a Read and Write, respectively. But, clear all * interrupts first. */ - mlxbf_i2c_write(priv->slv_cause->io, - MLXBF_I2C_CAUSE_OR_CLEAR, ~0); + writel(~0, priv->slv_cause->io + MLXBF_I2C_CAUSE_OR_CLEAR); int_reg = MLXBF_I2C_CAUSE_READ_WAIT_FW_RESPONSE; int_reg |= MLXBF_I2C_CAUSE_WRITE_SUCCESS; - mlxbf_i2c_write(priv->slv_cause->io, - MLXBF_I2C_CAUSE_OR_EVTEN0, int_reg); + writel(int_reg, priv->slv_cause->io + MLXBF_I2C_CAUSE_OR_EVTEN0); /* Finally, set the 'ready' bit to start handling transactions. */ - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_SLAVE_READY, 0x1); + writel(0x1, priv->smbus->io + MLXBF_I2C_SMBUS_SLAVE_READY); /* Initialize the cause coalesce resource. */ ret = mlxbf_i2c_init_coalesce(pdev, priv); @@ -1844,23 +1809,21 @@ static bool mlxbf_i2c_has_coalesce(struct mlxbf_i2c_priv *priv, bool *read, MLXBF_I2C_CAUSE_YU_SLAVE_BIT : priv->bus + MLXBF_I2C_CAUSE_TYU_SLAVE_BIT; - coalesce0_reg = mlxbf_i2c_read(priv->coalesce->io, - MLXBF_I2C_CAUSE_COALESCE_0); + coalesce0_reg = readl(priv->coalesce->io + MLXBF_I2C_CAUSE_COALESCE_0); is_set = coalesce0_reg & (1 << slave_shift); if (!is_set) return false; /* Check the source of the interrupt, i.e. whether a Read or Write. */ - cause_reg = mlxbf_i2c_read(priv->slv_cause->io, - MLXBF_I2C_CAUSE_ARBITER); + cause_reg = readl(priv->slv_cause->io + MLXBF_I2C_CAUSE_ARBITER); if (cause_reg & MLXBF_I2C_CAUSE_READ_WAIT_FW_RESPONSE) *read = true; else if (cause_reg & MLXBF_I2C_CAUSE_WRITE_SUCCESS) *write = true; /* Clear cause bits. */ - mlxbf_i2c_write(priv->slv_cause->io, MLXBF_I2C_CAUSE_OR_CLEAR, ~0x0); + writel(~0x0, priv->slv_cause->io + MLXBF_I2C_CAUSE_OR_CLEAR); return true; } @@ -1900,8 +1863,8 @@ static int mlxbf_smbus_irq_send(struct mlxbf_i2c_priv *priv, u8 recv_bytes) * address, if supplied. */ if (recv_bytes > 0) { - data32 = mlxbf_i2c_read_data(priv->smbus->io, - MLXBF_I2C_SLAVE_DATA_DESC_ADDR); + data32 = ioread32be(priv->smbus->io + + MLXBF_I2C_SLAVE_DATA_DESC_ADDR); /* Parse the received bytes. */ switch (recv_bytes) { @@ -1966,7 +1929,7 @@ static int mlxbf_smbus_irq_send(struct mlxbf_i2c_priv *priv, u8 recv_bytes) control32 |= rol32(write_size, MLXBF_I2C_SLAVE_WRITE_BYTES_SHIFT); control32 |= rol32(pec_en, MLXBF_I2C_SLAVE_SEND_PEC_SHIFT); - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_SLAVE_GW, control32); + writel(control32, priv->smbus->io + MLXBF_I2C_SMBUS_SLAVE_GW); /* * Wait until the transfer is completed; the driver will wait @@ -1975,10 +1938,9 @@ static int mlxbf_smbus_irq_send(struct mlxbf_i2c_priv *priv, u8 recv_bytes) mlxbf_smbus_slave_wait_for_idle(priv, MLXBF_I2C_SMBUS_TIMEOUT); /* Release the Slave GW. */ - mlxbf_i2c_write(priv->smbus->io, - MLXBF_I2C_SMBUS_SLAVE_RS_MASTER_BYTES, 0x0); - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_SLAVE_PEC, 0x0); - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_SLAVE_READY, 0x1); + writel(0x0, priv->smbus->io + MLXBF_I2C_SMBUS_SLAVE_RS_MASTER_BYTES); + writel(0x0, priv->smbus->io + MLXBF_I2C_SMBUS_SLAVE_PEC); + writel(0x1, priv->smbus->io + MLXBF_I2C_SMBUS_SLAVE_READY); return 0; } @@ -2023,10 +1985,9 @@ static int mlxbf_smbus_irq_recv(struct mlxbf_i2c_priv *priv, u8 recv_bytes) i2c_slave_event(slave, I2C_SLAVE_STOP, &value); /* Release the Slave GW. */ - mlxbf_i2c_write(priv->smbus->io, - MLXBF_I2C_SMBUS_SLAVE_RS_MASTER_BYTES, 0x0); - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_SLAVE_PEC, 0x0); - mlxbf_i2c_write(priv->smbus->io, MLXBF_I2C_SMBUS_SLAVE_READY, 0x1); + writel(0x0, priv->smbus->io + MLXBF_I2C_SMBUS_SLAVE_RS_MASTER_BYTES); + writel(0x0, priv->smbus->io + MLXBF_I2C_SMBUS_SLAVE_PEC); + writel(0x1, priv->smbus->io + MLXBF_I2C_SMBUS_SLAVE_READY); return ret; } @@ -2061,8 +2022,8 @@ static irqreturn_t mlxbf_smbus_irq(int irq, void *ptr) * slave, if the higher 8 bits are sent then the slave expect N bytes * from the master. */ - rw_bytes_reg = mlxbf_i2c_read(priv->smbus->io, - MLXBF_I2C_SMBUS_SLAVE_RS_MASTER_BYTES); + rw_bytes_reg = readl(priv->smbus->io + + MLXBF_I2C_SMBUS_SLAVE_RS_MASTER_BYTES); recv_bytes = (rw_bytes_reg >> 8) & GENMASK(7, 0); /* -- cgit v1.2.3 From 67ee9fda6ed4958f4caf8f1a3a5102a2c27ddf7b Mon Sep 17 00:00:00 2001 From: Khalil Blaiech Date: Tue, 3 Nov 2020 14:54:41 -0500 Subject: i2c: mlxbf: Update reference clock frequency The reference clock frequency remains the same across Bluefield products. Thus, update the frequency and rename the macro. Fixes: b5b5b32081cd206b ("i2c: mlxbf: I2C SMBus driver for Mellanox BlueField SoC") Reviewed-by: Leon Romanovsky Signed-off-by: Khalil Blaiech Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-mlxbf.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/busses/i2c-mlxbf.c b/drivers/i2c/busses/i2c-mlxbf.c index fca8a3bddcb1..afc996d07504 100644 --- a/drivers/i2c/busses/i2c-mlxbf.c +++ b/drivers/i2c/busses/i2c-mlxbf.c @@ -62,10 +62,8 @@ * Master. Default value is set to 400MHz. */ #define MLXBF_I2C_TYU_PLL_OUT_FREQ (400 * 1000 * 1000) -/* Reference clock for Bluefield 1 - 156 MHz. */ -#define MLXBF_I2C_TYU_PLL_IN_FREQ (156 * 1000 * 1000) -/* Reference clock for BlueField 2 - 200 MHz. */ -#define MLXBF_I2C_YU_PLL_IN_FREQ (200 * 1000 * 1000) +/* Reference clock for Bluefield - 156 MHz. */ +#define MLXBF_I2C_PLL_IN_FREQ (156 * 1000 * 1000) /* Constant used to determine the PLL frequency. */ #define MLNXBF_I2C_COREPLL_CONST 16384 @@ -1422,7 +1420,7 @@ static u64 mlxbf_calculate_freq_from_tyu(struct mlxbf_i2c_resource *corepll_res) u32 corepll_val; u16 core_f; - pad_frequency = MLXBF_I2C_TYU_PLL_IN_FREQ; + pad_frequency = MLXBF_I2C_PLL_IN_FREQ; corepll_val = readl(corepll_res->io + MLXBF_I2C_CORE_PLL_REG1); @@ -1457,7 +1455,7 @@ static u64 mlxbf_calculate_freq_from_yu(struct mlxbf_i2c_resource *corepll_res) u8 core_od, core_r; u32 core_f; - pad_frequency = MLXBF_I2C_YU_PLL_IN_FREQ; + pad_frequency = MLXBF_I2C_PLL_IN_FREQ; corepll_reg1_val = readl(corepll_res->io + MLXBF_I2C_CORE_PLL_REG1); corepll_reg2_val = readl(corepll_res->io + MLXBF_I2C_CORE_PLL_REG2); -- cgit v1.2.3 From 54b9c3d0cea53f84024eed11ff8b6807e2ec81bf Mon Sep 17 00:00:00 2001 From: Khalil Blaiech Date: Tue, 3 Nov 2020 14:54:42 -0500 Subject: i2c: mlxbf: Update author and maintainer email info Correct the email addresses of the author and the maintainer of the Mellanox BlueField I2C driver. Fixes: b5b5b32081cd206b ("i2c: mlxbf: I2C SMBus driver for Mellanox BlueField SoC") Reviewed-by: Leon Romanovsky Signed-off-by: Khalil Blaiech Signed-off-by: Wolfram Sang --- MAINTAINERS | 2 +- drivers/i2c/busses/i2c-mlxbf.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index b516bb34a8d5..551587f4b74e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11163,7 +11163,7 @@ F: Documentation/devicetree/bindings/input/touchscreen/melfas_mip4.txt F: drivers/input/touchscreen/melfas_mip4.c MELLANOX BLUEFIELD I2C DRIVER -M: Khalil Blaiech +M: Khalil Blaiech L: linux-i2c@vger.kernel.org S: Supported F: drivers/i2c/busses/i2c-mlxbf.c diff --git a/drivers/i2c/busses/i2c-mlxbf.c b/drivers/i2c/busses/i2c-mlxbf.c index afc996d07504..33574d40ea9c 100644 --- a/drivers/i2c/busses/i2c-mlxbf.c +++ b/drivers/i2c/busses/i2c-mlxbf.c @@ -2470,5 +2470,5 @@ static void __exit mlxbf_i2c_exit(void) module_exit(mlxbf_i2c_exit); MODULE_DESCRIPTION("Mellanox BlueField I2C bus driver"); -MODULE_AUTHOR("Khalil Blaiech "); +MODULE_AUTHOR("Khalil Blaiech "); MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 9890923be3a6d5e606cf4ae4f4e632a5e38cb37c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 5 Oct 2020 14:49:49 +0200 Subject: i2c: mlxbf: I2C_MLXBF should depend on MELLANOX_PLATFORM The Mellanox BlueField I2C controller is only present on Mellanox BlueField SoCs. Hence add a dependency on MELLANOX_PLATFORM, to prevent asking the user about this driver when configuring a kernel without Mellanox platform support. Fixes: b5b5b32081cd206b ("i2c: mlxbf: I2C SMBus driver for Mellanox BlueField SoC") Signed-off-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- drivers/i2c/busses/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index a4f473ef4e5c..a97a9d058198 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -733,7 +733,7 @@ config I2C_LPC2K config I2C_MLXBF tristate "Mellanox BlueField I2C controller" - depends on ARM64 + depends on MELLANOX_PLATFORM && ARM64 help Enabling this option will add I2C SMBus support for Mellanox BlueField system. -- cgit v1.2.3 From 66b92313e2ca9208b5f3ebf5d86e9a818299d8fa Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Fri, 30 Oct 2020 16:04:19 +0800 Subject: i2c: designware: call i2c_dw_read_clear_intrbits_slave() once If some bits were cleared by i2c_dw_read_clear_intrbits_slave() in i2c_dw_isr_slave() and not handled immediately, those cleared bits would not be shown again by later i2c_dw_read_clear_intrbits_slave(). They therefore were forgotten to be handled. i2c_dw_read_clear_intrbits_slave() should be called once in an ISR and take its returned state for all later handlings. Signed-off-by: Michael Wu Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-slave.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index 44974b53a626..13de01a0f75f 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -159,7 +159,6 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) u32 raw_stat, stat, enabled, tmp; u8 val = 0, slave_activity; - regmap_read(dev->map, DW_IC_INTR_STAT, &stat); regmap_read(dev->map, DW_IC_ENABLE, &enabled); regmap_read(dev->map, DW_IC_RAW_INTR_STAT, &raw_stat); regmap_read(dev->map, DW_IC_STATUS, &tmp); @@ -168,6 +167,7 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) if (!enabled || !(raw_stat & ~DW_IC_INTR_ACTIVITY) || !dev->slave) return 0; + stat = i2c_dw_read_clear_intrbits_slave(dev); dev_dbg(dev->dev, "%#x STATUS SLAVE_ACTIVITY=%#x : RAW_INTR_STAT=%#x : INTR_STAT=%#x\n", enabled, slave_activity, raw_stat, stat); @@ -188,11 +188,9 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) val); } regmap_read(dev->map, DW_IC_CLR_RD_REQ, &tmp); - stat = i2c_dw_read_clear_intrbits_slave(dev); } else { regmap_read(dev->map, DW_IC_CLR_RD_REQ, &tmp); regmap_read(dev->map, DW_IC_CLR_RX_UNDER, &tmp); - stat = i2c_dw_read_clear_intrbits_slave(dev); } if (!i2c_slave_event(dev->slave, I2C_SLAVE_READ_REQUESTED, @@ -207,7 +205,6 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) regmap_read(dev->map, DW_IC_CLR_RX_DONE, &tmp); i2c_slave_event(dev->slave, I2C_SLAVE_STOP, &val); - stat = i2c_dw_read_clear_intrbits_slave(dev); return 1; } @@ -219,7 +216,6 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) dev_vdbg(dev->dev, "Byte %X acked!", val); } else { i2c_slave_event(dev->slave, I2C_SLAVE_STOP, &val); - stat = i2c_dw_read_clear_intrbits_slave(dev); } return 1; @@ -230,7 +226,6 @@ static irqreturn_t i2c_dw_isr_slave(int this_irq, void *dev_id) struct dw_i2c_dev *dev = dev_id; int ret; - i2c_dw_read_clear_intrbits_slave(dev); ret = i2c_dw_irq_handler_slave(dev); if (ret > 0) complete(&dev->cmd_complete); -- cgit v1.2.3 From 3b5f7f10ff6e6b66f553e12cc50d9bb751ce60ad Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Fri, 30 Oct 2020 16:04:20 +0800 Subject: i2c: designware: slave should do WRITE_REQUESTED before WRITE_RECEIVED Sometimes we would get the following flow when doing an i2cset: 0x1 STATUS SLAVE_ACTIVITY=0x1 : RAW_INTR_STAT=0x514 : INTR_STAT=0x4 I2C_SLAVE_WRITE_RECEIVED 0x1 STATUS SLAVE_ACTIVITY=0x0 : RAW_INTR_STAT=0x714 : INTR_STAT=0x204 I2C_SLAVE_WRITE_REQUESTED I2C_SLAVE_WRITE_RECEIVED Documentation/i2c/slave-interface.rst says that I2C_SLAVE_WRITE_REQUESTED, which is mandatory, should be sent while the data did not arrive yet. It means in a write-request I2C_SLAVE_WRITE_REQUESTED should be reported before any I2C_SLAVE_WRITE_RECEIVED. By the way, I2C_SLAVE_STOP didn't be reported in the above case because DW_IC_INTR_STAT was not 0x200. dev->status can be used to record the current state, especially Designware I2C controller has no interrupts to identify a write-request. This patch makes not only I2C_SLAVE_WRITE_REQUESTED been reported first when IC_INTR_RX_FULL is rising and dev->status isn't STATUS_WRITE_IN_PROGRESS but also I2C_SLAVE_STOP been reported when a STOP condition is received. Signed-off-by: Michael Wu Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-slave.c | 45 +++++++++++++------------------ 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index 13de01a0f75f..0d15f4c1e9f7 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -172,26 +172,25 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) "%#x STATUS SLAVE_ACTIVITY=%#x : RAW_INTR_STAT=%#x : INTR_STAT=%#x\n", enabled, slave_activity, raw_stat, stat); - if ((stat & DW_IC_INTR_RX_FULL) && (stat & DW_IC_INTR_STOP_DET)) - i2c_slave_event(dev->slave, I2C_SLAVE_WRITE_REQUESTED, &val); + if (stat & DW_IC_INTR_RX_FULL) { + if (dev->status != STATUS_WRITE_IN_PROGRESS) { + dev->status = STATUS_WRITE_IN_PROGRESS; + i2c_slave_event(dev->slave, I2C_SLAVE_WRITE_REQUESTED, + &val); + } + + regmap_read(dev->map, DW_IC_DATA_CMD, &tmp); + val = tmp; + if (!i2c_slave_event(dev->slave, I2C_SLAVE_WRITE_RECEIVED, + &val)) + dev_vdbg(dev->dev, "Byte %X acked!", val); + } if (stat & DW_IC_INTR_RD_REQ) { if (slave_activity) { - if (stat & DW_IC_INTR_RX_FULL) { - regmap_read(dev->map, DW_IC_DATA_CMD, &tmp); - val = tmp; - - if (!i2c_slave_event(dev->slave, - I2C_SLAVE_WRITE_RECEIVED, - &val)) { - dev_vdbg(dev->dev, "Byte %X acked!", - val); - } - regmap_read(dev->map, DW_IC_CLR_RD_REQ, &tmp); - } else { - regmap_read(dev->map, DW_IC_CLR_RD_REQ, &tmp); - regmap_read(dev->map, DW_IC_CLR_RX_UNDER, &tmp); - } + regmap_read(dev->map, DW_IC_CLR_RD_REQ, &tmp); + + dev->status = STATUS_READ_IN_PROGRESS; if (!i2c_slave_event(dev->slave, I2C_SLAVE_READ_REQUESTED, &val)) @@ -203,18 +202,10 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) if (!i2c_slave_event(dev->slave, I2C_SLAVE_READ_PROCESSED, &val)) regmap_read(dev->map, DW_IC_CLR_RX_DONE, &tmp); - - i2c_slave_event(dev->slave, I2C_SLAVE_STOP, &val); - return 1; } - if (stat & DW_IC_INTR_RX_FULL) { - regmap_read(dev->map, DW_IC_DATA_CMD, &tmp); - val = tmp; - if (!i2c_slave_event(dev->slave, I2C_SLAVE_WRITE_RECEIVED, - &val)) - dev_vdbg(dev->dev, "Byte %X acked!", val); - } else { + if (stat & DW_IC_INTR_STOP_DET) { + dev->status = STATUS_IDLE; i2c_slave_event(dev->slave, I2C_SLAVE_STOP, &val); } -- cgit v1.2.3 From faf000397e7f103df9953a312e1df21df1dc797f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 Nov 2020 11:30:09 +1100 Subject: KVM: arm64: Fix build error in user_mem_abort() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PUD and PMD are folded into PGD when the following options are enabled. In that case, PUD_SHIFT is equal to PMD_SHIFT and we fail to build with the indicated errors: CONFIG_ARM64_VA_BITS_42=y CONFIG_ARM64_PAGE_SHIFT=16 CONFIG_PGTABLE_LEVELS=3 arch/arm64/kvm/mmu.c: In function ‘user_mem_abort’: arch/arm64/kvm/mmu.c:798:2: error: duplicate case value case PMD_SHIFT: ^~~~ arch/arm64/kvm/mmu.c:791:2: note: previously used here case PUD_SHIFT: ^~~~ This fixes the issue by skipping the check on PUD huge page when PUD and PMD are folded into PGD. Fixes: 2f40c46021bbb ("KVM: arm64: Use fallback mapping sizes for contiguous huge page sizes") Reported-by: Eric Auger Signed-off-by: Gavin Shan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201103003009.32955-1-gshan@redhat.com --- arch/arm64/kvm/mmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c7c6df6309d5..a109c5001827 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -788,10 +788,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } switch (vma_shift) { +#ifndef __PAGETABLE_PMD_FOLDED case PUD_SHIFT: if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) break; fallthrough; +#endif case CONT_PMD_SHIFT: vma_shift = PMD_SHIFT; fallthrough; -- cgit v1.2.3 From f81cb2c3ad41ac6d8cb2650e3d72d5f67db1aa28 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Thu, 5 Nov 2020 10:10:19 +0100 Subject: KVM: arm64: Don't hide ID registers from userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ID registers are RAZ until they've been allocated a purpose, but that doesn't mean they should be removed from the KVM_GET_REG_LIST list. So far we only have one register, SYS_ID_AA64ZFR0_EL1, that is hidden from userspace when its function, SVE, is not present. Expose SYS_ID_AA64ZFR0_EL1 to userspace as RAZ when SVE is not implemented. Removing the userspace visibility checks is enough to reexpose it, as it will already return zero to userspace when SVE is not present. The register already behaves as RAZ for the guest when SVE is not present. Fixes: 73433762fcae ("KVM: arm64/sve: System register context switch and access support") Reported-by: 张东旭 Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org#v5.2+ Link: https://lore.kernel.org/r/20201105091022.15373-2-drjones@redhat.com --- arch/arm64/kvm/sys_regs.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 983994f01a63..3af306e6b9cd 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1193,16 +1193,6 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN_USER | REG_HIDDEN_GUEST; } -/* Visibility overrides for SVE-specific ID registers */ -static unsigned int sve_id_visibility(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - if (vcpu_has_sve(vcpu)) - return 0; - - return REG_HIDDEN_USER; -} - /* Generate the emulated ID_AA64ZFR0_EL1 value exposed to the guest */ static u64 guest_id_aa64zfr0_el1(const struct kvm_vcpu *vcpu) { @@ -1229,9 +1219,6 @@ static int get_id_aa64zfr0_el1(struct kvm_vcpu *vcpu, { u64 val; - if (WARN_ON(!vcpu_has_sve(vcpu))) - return -ENOENT; - val = guest_id_aa64zfr0_el1(vcpu); return reg_to_user(uaddr, &val, reg->id); } @@ -1244,9 +1231,6 @@ static int set_id_aa64zfr0_el1(struct kvm_vcpu *vcpu, int err; u64 val; - if (WARN_ON(!vcpu_has_sve(vcpu))) - return -ENOENT; - err = reg_from_user(&val, uaddr, id); if (err) return err; @@ -1509,7 +1493,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_SANITISED(ID_AA64PFR1_EL1), ID_UNALLOCATED(4,2), ID_UNALLOCATED(4,3), - { SYS_DESC(SYS_ID_AA64ZFR0_EL1), access_id_aa64zfr0_el1, .get_user = get_id_aa64zfr0_el1, .set_user = set_id_aa64zfr0_el1, .visibility = sve_id_visibility }, + { SYS_DESC(SYS_ID_AA64ZFR0_EL1), access_id_aa64zfr0_el1, .get_user = get_id_aa64zfr0_el1, .set_user = set_id_aa64zfr0_el1, }, ID_UNALLOCATED(4,5), ID_UNALLOCATED(4,6), ID_UNALLOCATED(4,7), -- cgit v1.2.3 From 01fe5ace92ddb8732e3331355e7ba9cb6f2ef787 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Thu, 5 Nov 2020 10:10:20 +0100 Subject: KVM: arm64: Consolidate REG_HIDDEN_GUEST/USER REG_HIDDEN_GUEST and REG_HIDDEN_USER are always used together. Consolidate them into a single REG_HIDDEN flag. We can always add another flag later if some register needs to expose itself differently to the guest than it does to userspace. No functional change intended. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201105091022.15373-3-drjones@redhat.com --- arch/arm64/kvm/sys_regs.c | 12 ++++++------ arch/arm64/kvm/sys_regs.h | 18 ++++-------------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 3af306e6b9cd..26a285127620 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1069,7 +1069,7 @@ static bool trap_ptrauth(struct kvm_vcpu *vcpu, static unsigned int ptrauth_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - return vcpu_has_ptrauth(vcpu) ? 0 : REG_HIDDEN_USER | REG_HIDDEN_GUEST; + return vcpu_has_ptrauth(vcpu) ? 0 : REG_HIDDEN; } #define __PTRAUTH_KEY(k) \ @@ -1190,7 +1190,7 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, if (vcpu_has_sve(vcpu)) return 0; - return REG_HIDDEN_USER | REG_HIDDEN_GUEST; + return REG_HIDDEN; } /* Generate the emulated ID_AA64ZFR0_EL1 value exposed to the guest */ @@ -2153,7 +2153,7 @@ static void perform_access(struct kvm_vcpu *vcpu, trace_kvm_sys_access(*vcpu_pc(vcpu), params, r); /* Check for regs disabled by runtime config */ - if (sysreg_hidden_from_guest(vcpu, r)) { + if (sysreg_hidden(vcpu, r)) { kvm_inject_undefined(vcpu); return; } @@ -2652,7 +2652,7 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg return get_invariant_sys_reg(reg->id, uaddr); /* Check for regs disabled by runtime config */ - if (sysreg_hidden_from_user(vcpu, r)) + if (sysreg_hidden(vcpu, r)) return -ENOENT; if (r->get_user) @@ -2677,7 +2677,7 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg return set_invariant_sys_reg(reg->id, uaddr); /* Check for regs disabled by runtime config */ - if (sysreg_hidden_from_user(vcpu, r)) + if (sysreg_hidden(vcpu, r)) return -ENOENT; if (r->set_user) @@ -2748,7 +2748,7 @@ static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, if (!(rd->reg || rd->get_user)) return 0; - if (sysreg_hidden_from_user(vcpu, rd)) + if (sysreg_hidden(vcpu, rd)) return 0; if (!copy_reg_to_user(rd, uind)) diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 5a6fc30f5989..2641b2ee6a91 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -59,8 +59,7 @@ struct sys_reg_desc { const struct sys_reg_desc *rd); }; -#define REG_HIDDEN_USER (1 << 0) /* hidden from userspace ioctls */ -#define REG_HIDDEN_GUEST (1 << 1) /* hidden from guest */ +#define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */ static __printf(2, 3) inline void print_sys_reg_msg(const struct sys_reg_params *p, @@ -111,22 +110,13 @@ static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r __vcpu_sys_reg(vcpu, r->reg) = r->val; } -static inline bool sysreg_hidden_from_guest(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *r) -{ - if (likely(!r->visibility)) - return false; - - return r->visibility(vcpu, r) & REG_HIDDEN_GUEST; -} - -static inline bool sysreg_hidden_from_user(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *r) +static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) { if (likely(!r->visibility)) return false; - return r->visibility(vcpu, r) & REG_HIDDEN_USER; + return r->visibility(vcpu, r) & REG_HIDDEN; } static inline int cmp_sys_reg(const struct sys_reg_desc *i1, -- cgit v1.2.3 From 912dee572691ffb2b387dd8b4f183d549a6b24d1 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Thu, 5 Nov 2020 10:10:21 +0100 Subject: KVM: arm64: Check RAZ visibility in ID register accessors The instruction encodings of ID registers are preallocated. Until an encoding is assigned a purpose the register is RAZ. KVM's general ID register accessor functions already support both paths, RAZ or not. If for each ID register we can determine if it's RAZ or not, then all ID registers can build on the general functions. The register visibility function allows us to check whether a register should be completely hidden or not, extending it to also report when the register should be RAZ or not allows us to use it for ID registers as well. Check for RAZ visibility in the ID register accessor functions, allowing the RAZ case to be handled in a generic way for all system registers. The new REG_RAZ flag will be used in a later patch. This patch has no intended functional change. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201105091022.15373-4-drjones@redhat.com --- arch/arm64/kvm/sys_regs.c | 19 ++++++++++++++++--- arch/arm64/kvm/sys_regs.h | 10 ++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 26a285127620..4ee098abd87c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1151,6 +1151,12 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, return val; } +static unsigned int id_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + return 0; +} + /* cpufeature ID register access trap handlers */ static bool __access_id_reg(struct kvm_vcpu *vcpu, @@ -1169,7 +1175,9 @@ static bool access_id_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - return __access_id_reg(vcpu, p, r, false); + bool raz = sysreg_visible_as_raz(vcpu, r); + + return __access_id_reg(vcpu, p, r, raz); } static bool access_raz_id_reg(struct kvm_vcpu *vcpu, @@ -1281,13 +1289,17 @@ static int __set_id_reg(const struct kvm_vcpu *vcpu, static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(vcpu, rd, uaddr, false); + bool raz = sysreg_visible_as_raz(vcpu, rd); + + return __get_id_reg(vcpu, rd, uaddr, raz); } static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(vcpu, rd, uaddr, false); + bool raz = sysreg_visible_as_raz(vcpu, rd); + + return __set_id_reg(vcpu, rd, uaddr, raz); } static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, @@ -1372,6 +1384,7 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, .access = access_id_reg, \ .get_user = get_id_reg, \ .set_user = set_id_reg, \ + .visibility = id_visibility, \ } /* diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 2641b2ee6a91..0f95964339b1 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -60,6 +60,7 @@ struct sys_reg_desc { }; #define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */ +#define REG_RAZ (1 << 1) /* RAZ from userspace and guest */ static __printf(2, 3) inline void print_sys_reg_msg(const struct sys_reg_params *p, @@ -119,6 +120,15 @@ static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, return r->visibility(vcpu, r) & REG_HIDDEN; } +static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + if (likely(!r->visibility)) + return false; + + return r->visibility(vcpu, r) & REG_RAZ; +} + static inline int cmp_sys_reg(const struct sys_reg_desc *i1, const struct sys_reg_desc *i2) { -- cgit v1.2.3 From c512298eed0360923d0cbc4a1f30bc0509af0d50 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Thu, 5 Nov 2020 10:10:22 +0100 Subject: KVM: arm64: Remove AA64ZFR0_EL1 accessors The AA64ZFR0_EL1 accessors are just the general accessors with its visibility function open-coded. It also skips the if-else chain in read_id_reg, but there's no reason not to go there. Indeed consolidating ID register accessors and removing lines of code make it worthwhile. Remove the AA64ZFR0_EL1 accessors, replacing them with the general accessors for sanitized ID registers. No functional change intended. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201105091022.15373-5-drjones@redhat.com --- arch/arm64/kvm/sys_regs.c | 61 +++++++++-------------------------------------- 1 file changed, 11 insertions(+), 50 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 4ee098abd87c..436043cd327e 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1154,6 +1154,16 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, static unsigned int id_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { + u32 id = sys_reg((u32)r->Op0, (u32)r->Op1, + (u32)r->CRn, (u32)r->CRm, (u32)r->Op2); + + switch (id) { + case SYS_ID_AA64ZFR0_EL1: + if (!vcpu_has_sve(vcpu)) + return REG_RAZ; + break; + } + return 0; } @@ -1201,55 +1211,6 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } -/* Generate the emulated ID_AA64ZFR0_EL1 value exposed to the guest */ -static u64 guest_id_aa64zfr0_el1(const struct kvm_vcpu *vcpu) -{ - if (!vcpu_has_sve(vcpu)) - return 0; - - return read_sanitised_ftr_reg(SYS_ID_AA64ZFR0_EL1); -} - -static bool access_id_aa64zfr0_el1(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) -{ - if (p->is_write) - return write_to_read_only(vcpu, p, rd); - - p->regval = guest_id_aa64zfr0_el1(vcpu); - return true; -} - -static int get_id_aa64zfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) -{ - u64 val; - - val = guest_id_aa64zfr0_el1(vcpu); - return reg_to_user(uaddr, &val, reg->id); -} - -static int set_id_aa64zfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) -{ - const u64 id = sys_reg_to_index(rd); - int err; - u64 val; - - err = reg_from_user(&val, uaddr, id); - if (err) - return err; - - /* This is what we mean by invariant: you can't change it. */ - if (val != guest_id_aa64zfr0_el1(vcpu)) - return -EINVAL; - - return 0; -} - /* * cpufeature ID register user accessors * @@ -1506,7 +1467,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_SANITISED(ID_AA64PFR1_EL1), ID_UNALLOCATED(4,2), ID_UNALLOCATED(4,3), - { SYS_DESC(SYS_ID_AA64ZFR0_EL1), access_id_aa64zfr0_el1, .get_user = get_id_aa64zfr0_el1, .set_user = set_id_aa64zfr0_el1, }, + ID_SANITISED(ID_AA64ZFR0_EL1), ID_UNALLOCATED(4,5), ID_UNALLOCATED(4,6), ID_UNALLOCATED(4,7), -- cgit v1.2.3 From 912ab37c798770f21b182d656937072b58553378 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Mon, 2 Nov 2020 20:07:49 +0800 Subject: serial: 8250_mtk: Fix uart_get_baud_rate warning Mediatek 8250 port supports speed higher than uartclk / 16. If the baud rates in both the new and the old termios setting are higher than uartclk / 16, the WARN_ON in uart_get_baud_rate() will be triggered. Passing NULL as the old termios so uart_get_baud_rate() will use uartclk / 16 - 1 as the new baud rate which will be replaced by the original baud rate later by tty_termios_encode_baud_rate() in mtk8250_set_termios(). Fixes: 551e553f0d4a ("serial: 8250_mtk: Fix high-speed baud rates clamping") Signed-off-by: Claire Chang Link: https://lore.kernel.org/r/20201102120749.374458-1-tientzu@chromium.org Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_mtk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c index 41f4120abdf2..fa876e2c13e5 100644 --- a/drivers/tty/serial/8250/8250_mtk.c +++ b/drivers/tty/serial/8250/8250_mtk.c @@ -317,7 +317,7 @@ mtk8250_set_termios(struct uart_port *port, struct ktermios *termios, */ baud = tty_termios_baud_rate(termios); - serial8250_do_set_termios(port, termios, old); + serial8250_do_set_termios(port, termios, NULL); tty_termios_encode_baud_rate(termios, baud, baud); -- cgit v1.2.3 From 427627a23c3e86e31113f9db9bfdca41698a0ee5 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Thu, 5 Nov 2020 21:40:26 +0100 Subject: tty: serial: imx: enable earlycon by default if IMX_SERIAL_CONSOLE is enabled Since 699cc4dfd140 (tty: serial: imx: add imx earlycon driver), the earlycon part of imx serial is a separate driver and isn't necessarily enabled anymore when the console is enabled. This causes users to loose the earlycon functionality when upgrading their kenrel configuration via oldconfig. Enable earlycon by default when IMX_SERIAL_CONSOLE is enabled. Fixes: 699cc4dfd140 (tty: serial: imx: add imx earlycon driver) Reviewed-by: Fabio Estevam Reviewed-by: Fugang Duan Signed-off-by: Lucas Stach Link: https://lore.kernel.org/r/20201105204026.1818219-1-l.stach@pengutronix.de Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 1044fc387691..28f22e58639c 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -522,6 +522,7 @@ config SERIAL_IMX_EARLYCON depends on OF select SERIAL_EARLYCON select SERIAL_CORE_CONSOLE + default y if SERIAL_IMX_CONSOLE help If you have enabled the earlycon on the Freescale IMX CPU you can make it the earlycon by answering Y to this option. -- cgit v1.2.3 From 0c5fc92622ed5531ff324b20f014e9e3092f0187 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Tue, 3 Nov 2020 16:49:42 +0800 Subject: serial: txx9: add missing platform_driver_unregister() on error in serial_txx9_init Add the missing platform_driver_unregister() before return from serial_txx9_init in the error handling case when failed to register serial_txx9_pci_driver with macro ENABLE_SERIAL_TXX9_PCI defined. Fixes: ab4382d27412 ("tty: move drivers/serial/ to drivers/tty/serial/") Signed-off-by: Qinglang Miao Link: https://lore.kernel.org/r/20201103084942.109076-1-miaoqinglang@huawei.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_txx9.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/tty/serial/serial_txx9.c b/drivers/tty/serial/serial_txx9.c index b4d89e31730e..7a07e7272de1 100644 --- a/drivers/tty/serial/serial_txx9.c +++ b/drivers/tty/serial/serial_txx9.c @@ -1280,6 +1280,9 @@ static int __init serial_txx9_init(void) #ifdef ENABLE_SERIAL_TXX9_PCI ret = pci_register_driver(&serial_txx9_pci_driver); + if (ret) { + platform_driver_unregister(&serial_txx9_plat_driver); + } #endif if (ret == 0) goto out; -- cgit v1.2.3 From 4466d6d2f80c1193e0845d110277c56da77a6418 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Thu, 5 Nov 2020 13:34:32 +0100 Subject: tty: fix crash in release_tty if tty->port is not set Commit 2ae0b31e0face ("tty: don't crash in tty_init_dev when missing tty_port") didn't fully prevent the crash as the cleanup path in tty_init_dev() calls release_tty() which dereferences tty->port without checking it for non-null. Add tty->port checks to release_tty to avoid the kernel crash. Fixes: 2ae0b31e0face ("tty: don't crash in tty_init_dev when missing tty_port") Signed-off-by: Matthias Reichl Link: https://lore.kernel.org/r/20201105123432.4448-1-hias@horus.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 7a4c02548fb3..9f8b9a567b35 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -1515,10 +1515,12 @@ static void release_tty(struct tty_struct *tty, int idx) tty->ops->shutdown(tty); tty_save_termios(tty); tty_driver_remove_tty(tty->driver, tty); - tty->port->itty = NULL; + if (tty->port) + tty->port->itty = NULL; if (tty->link) tty->link->port->itty = NULL; - tty_buffer_cancel_work(tty->port); + if (tty->port) + tty_buffer_cancel_work(tty->port); if (tty->link) tty_buffer_cancel_work(tty->link->port); -- cgit v1.2.3 From e1777d099728a76a8f8090f89649aac961e7e530 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 6 Nov 2020 20:01:41 +0900 Subject: null_blk: Fix scheduling in atomic with zoned mode Commit aa1c09cb65e2 ("null_blk: Fix locking in zoned mode") changed zone locking to using the potentially sleeping wait_on_bit_io() function. This is acceptable when memory backing is enabled as the device queue is in that case marked as blocking, but this triggers a scheduling while in atomic context with memory backing disabled. Fix this by relying solely on the device zone spinlock for zone information protection without temporarily releasing this lock around null_process_cmd() execution in null_zone_write(). This is OK to do since when memory backing is disabled, command processing does not block and the memory backing lock nullb->lock is unused. This solution avoids the overhead of having to mark a zoned null_blk device queue as blocking when memory backing is unused. This patch also adds comments to the zone locking code to explain the unusual locking scheme. Fixes: aa1c09cb65e2 ("null_blk: Fix locking in zoned mode") Reported-by: kernel test robot Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 2 +- drivers/block/null_blk_zoned.c | 47 ++++++++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index cfd00ad40355..c24d9b5ad81a 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -47,7 +47,7 @@ struct nullb_device { unsigned int nr_zones_closed; struct blk_zone *zones; sector_t zone_size_sects; - spinlock_t zone_dev_lock; + spinlock_t zone_lock; unsigned long *zone_locks; unsigned long size; /* device size in MB */ diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 8775acbb4f8f..beb34b4f76b0 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -46,11 +46,20 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) if (!dev->zones) return -ENOMEM; - spin_lock_init(&dev->zone_dev_lock); - dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL); - if (!dev->zone_locks) { - kvfree(dev->zones); - return -ENOMEM; + /* + * With memory backing, the zone_lock spinlock needs to be temporarily + * released to avoid scheduling in atomic context. To guarantee zone + * information protection, use a bitmap to lock zones with + * wait_on_bit_lock_io(). Sleeping on the lock is OK as memory backing + * implies that the queue is marked with BLK_MQ_F_BLOCKING. + */ + spin_lock_init(&dev->zone_lock); + if (dev->memory_backed) { + dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL); + if (!dev->zone_locks) { + kvfree(dev->zones); + return -ENOMEM; + } } if (dev->zone_nr_conv >= dev->nr_zones) { @@ -137,12 +146,17 @@ void null_free_zoned_dev(struct nullb_device *dev) static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno) { - wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE); + if (dev->memory_backed) + wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE); + spin_lock_irq(&dev->zone_lock); } static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno) { - clear_and_wake_up_bit(zno, dev->zone_locks); + spin_unlock_irq(&dev->zone_lock); + + if (dev->memory_backed) + clear_and_wake_up_bit(zno, dev->zone_locks); } int null_report_zones(struct gendisk *disk, sector_t sector, @@ -322,7 +336,6 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); null_lock_zone(dev, zno); - spin_lock(&dev->zone_dev_lock); switch (zone->cond) { case BLK_ZONE_COND_FULL: @@ -375,9 +388,17 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, if (zone->cond != BLK_ZONE_COND_EXP_OPEN) zone->cond = BLK_ZONE_COND_IMP_OPEN; - spin_unlock(&dev->zone_dev_lock); + /* + * Memory backing allocation may sleep: release the zone_lock spinlock + * to avoid scheduling in atomic context. Zone operation atomicity is + * still guaranteed through the zone_locks bitmap. + */ + if (dev->memory_backed) + spin_unlock_irq(&dev->zone_lock); ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - spin_lock(&dev->zone_dev_lock); + if (dev->memory_backed) + spin_lock_irq(&dev->zone_lock); + if (ret != BLK_STS_OK) goto unlock; @@ -392,7 +413,6 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, ret = BLK_STS_OK; unlock: - spin_unlock(&dev->zone_dev_lock); null_unlock_zone(dev, zno); return ret; @@ -516,9 +536,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, null_lock_zone(dev, i); zone = &dev->zones[i]; if (zone->cond != BLK_ZONE_COND_EMPTY) { - spin_lock(&dev->zone_dev_lock); null_reset_zone(dev, zone); - spin_unlock(&dev->zone_dev_lock); trace_nullb_zone_op(cmd, i, zone->cond); } null_unlock_zone(dev, i); @@ -530,7 +548,6 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, zone = &dev->zones[zone_no]; null_lock_zone(dev, zone_no); - spin_lock(&dev->zone_dev_lock); switch (op) { case REQ_OP_ZONE_RESET: @@ -550,8 +567,6 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, break; } - spin_unlock(&dev->zone_dev_lock); - if (ret == BLK_STS_OK) trace_nullb_zone_op(cmd, zone_no, zone->cond); -- cgit v1.2.3 From 92cfcd030e4b1de11a6b1edb0840e55c26332d31 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 30 Oct 2020 17:45:56 -0700 Subject: fscrypt: remove reachable WARN in fscrypt_setup_iv_ino_lblk_32_key() I_CREATING isn't actually set until the inode has been assigned an inode number and inserted into the inode hash table. So the WARN_ON() in fscrypt_setup_iv_ino_lblk_32_key() is wrong, and it can trigger when creating an encrypted file on ext4. Remove it. This was sometimes causing xfstest generic/602 to fail on ext4. I didn't notice it before because due to a separate oversight, new inodes that haven't been assigned an inode number yet don't necessarily have i_ino == 0 as I had thought, so by chance I never saw the test fail. Fixes: a992b20cd4ee ("fscrypt: add fscrypt_prepare_new_inode() and fscrypt_set_context()") Reported-by: Theodore Y. Ts'o Link: https://lore.kernel.org/r/20201031004556.87862-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/keysetup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index d3c3e5d9b41f..d595abb8ef90 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -269,9 +269,7 @@ unlock: * New inodes may not have an inode number assigned yet. * Hashing their inode number is delayed until later. */ - if (ci->ci_inode->i_ino == 0) - WARN_ON(!(ci->ci_inode->i_state & I_CREATING)); - else + if (ci->ci_inode->i_ino) fscrypt_hash_inode_number(ci, mk); return 0; } -- cgit v1.2.3 From d4d50710a8b46082224376ef119a4dbb75b25c56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Nov 2020 09:27:33 +0100 Subject: seq_file: add seq_read_iter iov_iter based variant for reading a seq_file. seq_read is reimplemented on top of the iter variant. Signed-off-by: Christoph Hellwig Tested-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- fs/seq_file.c | 45 ++++++++++++++++++++++++++++++++------------- include/linux/seq_file.h | 1 + 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/fs/seq_file.c b/fs/seq_file.c index 31219c1db17d..3b20e21604e7 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -146,7 +147,28 @@ Eoverflow: */ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { - struct seq_file *m = file->private_data; + struct iovec iov = { .iov_base = buf, .iov_len = size}; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, file); + iov_iter_init(&iter, READ, &iov, 1, size); + + kiocb.ki_pos = *ppos; + ret = seq_read_iter(&kiocb, &iter); + *ppos = kiocb.ki_pos; + return ret; +} +EXPORT_SYMBOL(seq_read); + +/* + * Ready-made ->f_op->read_iter() + */ +ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct seq_file *m = iocb->ki_filp->private_data; + size_t size = iov_iter_count(iter); size_t copied = 0; size_t n; void *p; @@ -158,14 +180,14 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) * if request is to read from zero offset, reset iterator to first * record as it might have been already advanced by previous requests */ - if (*ppos == 0) { + if (iocb->ki_pos == 0) { m->index = 0; m->count = 0; } - /* Don't assume *ppos is where we left it */ - if (unlikely(*ppos != m->read_pos)) { - while ((err = traverse(m, *ppos)) == -EAGAIN) + /* Don't assume ki_pos is where we left it */ + if (unlikely(iocb->ki_pos != m->read_pos)) { + while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN) ; if (err) { /* With prejudice... */ @@ -174,7 +196,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) m->count = 0; goto Done; } else { - m->read_pos = *ppos; + m->read_pos = iocb->ki_pos; } } @@ -187,13 +209,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* if not empty - flush it first */ if (m->count) { n = min(m->count, size); - err = copy_to_user(buf, m->buf + m->from, n); - if (err) + if (copy_to_iter(m->buf + m->from, n, iter) != n) goto Efault; m->count -= n; m->from += n; size -= n; - buf += n; copied += n; if (!size) goto Done; @@ -254,8 +274,7 @@ Fill: } m->op->stop(m, p); n = min(m->count, size); - err = copy_to_user(buf, m->buf, n); - if (err) + if (copy_to_iter(m->buf, n, iter) != n) goto Efault; copied += n; m->count -= n; @@ -264,7 +283,7 @@ Done: if (!copied) copied = err; else { - *ppos += copied; + iocb->ki_pos += copied; m->read_pos += copied; } mutex_unlock(&m->lock); @@ -276,7 +295,7 @@ Efault: err = -EFAULT; goto Done; } -EXPORT_SYMBOL(seq_read); +EXPORT_SYMBOL(seq_read_iter); /** * seq_lseek - ->llseek() method for sequential files. diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 813614d4b71f..b83b3ae3c877 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -107,6 +107,7 @@ void seq_pad(struct seq_file *m, char c); char *mangle_path(char *s, const char *p, const char *esc); int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); +ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter); loff_t seq_lseek(struct file *, loff_t, int); int seq_release(struct inode *, struct file *); int seq_write(struct seq_file *seq, const void *data, size_t len); -- cgit v1.2.3 From fe33850ff798eb8d57eea88cc14090770013bb73 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Nov 2020 09:27:34 +0100 Subject: proc: wire up generic_file_splice_read for iter ops Wire up generic_file_splice_read for the iter based proxy ops, so that splice reads from them work. Signed-off-by: Christoph Hellwig Tested-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 58c075e2a452..bde6b6f69852 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -597,6 +597,7 @@ static const struct file_operations proc_iter_file_ops = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .write = proc_reg_write, + .splice_read = generic_file_splice_read, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, @@ -622,6 +623,7 @@ static const struct file_operations proc_reg_file_ops_compat = { static const struct file_operations proc_iter_file_ops_compat = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, + .splice_read = generic_file_splice_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, -- cgit v1.2.3 From 70fce7d2253938191275ebcbd46efe45fceb05a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Nov 2020 09:27:35 +0100 Subject: proc/cpuinfo: switch to ->read_iter Implement ->read_iter so that the Android bionic test suite can use this random proc file for its splice test case. Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/proc/cpuinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index d0989a443c77..419760fd77bd 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -19,7 +19,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file) static const struct proc_ops cpuinfo_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = cpuinfo_open, - .proc_read = seq_read, + .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = seq_release, }; -- cgit v1.2.3 From 28589f9e0f942377e9994711f5765b01d1b8eaa2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Nov 2020 09:27:36 +0100 Subject: proc/stat: switch to ->read_iter Implement ->read_iter so that splice can be used on this file. Suggested-by: Linus Torvalds Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/proc/stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 46b3293015fe..4695b6de3151 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -226,7 +226,7 @@ static int stat_open(struct inode *inode, struct file *file) static const struct proc_ops stat_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = stat_open, - .proc_read = seq_read, + .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = single_release, }; -- cgit v1.2.3 From 7cfc630e63b4f7b2ab5a1238c566a6b799ae1624 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 4 Nov 2020 09:27:37 +0100 Subject: proc "single files": switch to ->read_iter Implement ->read_iter for all proc "single files" so that more bionic tests cases can pass when they call splice() on other fun files like /proc/version Signed-off-by: Greg Kroah-Hartman Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 2f9fa179194d..f81327673f49 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -621,7 +621,7 @@ static int proc_single_open(struct inode *inode, struct file *file) static const struct proc_ops proc_single_ops = { /* not permanent -- can call into arbitrary ->single_show */ .proc_open = proc_single_open, - .proc_read = seq_read, + .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = single_release, }; -- cgit v1.2.3 From b24c30c678630e48cf8e3caefe463e1c6144d029 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Nov 2020 09:27:38 +0100 Subject: proc "seq files": switch to ->read_iter Implement ->read_iter for all proc "seq files" so that splice works on them. Suggested-by: Linus Torvalds Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index f81327673f49..b84663252add 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -590,7 +590,7 @@ static int proc_seq_release(struct inode *inode, struct file *file) static const struct proc_ops proc_seq_ops = { /* not permanent -- can call into arbitrary seq_operations */ .proc_open = proc_seq_open, - .proc_read = seq_read, + .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = proc_seq_release, }; -- cgit v1.2.3 From d435c05ab0197ee302290e1cee3f2d9c9024a64f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 6 Nov 2020 15:39:50 -0500 Subject: net/sunrpc: return 0 on attempt to write to "transports" You can't write to this file because the permissions are 0444. But it sort of looked like you could do a write and it would result in a read. Then it looked like proc_sys_call_handler() just ignored it. Which is confusing. It's more clear if the "write" just returns zero. Also, the "lenp" pointer is never NULL so that check can be removed. Signed-off-by: Dan Carpenter Signed-off-by: J. Bruce Fields --- net/sunrpc/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index a18b36b5422d..5c9f5bca4d99 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -65,7 +65,7 @@ static int proc_do_xprt(struct ctl_table *table, int write, char tmpbuf[256]; size_t len; - if ((*ppos && !write) || !*lenp) { + if (write || *ppos) { *lenp = 0; return 0; } -- cgit v1.2.3 From 6f64e477830000746c1f992050fbd45c03c89429 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 5 Nov 2020 23:06:51 +0000 Subject: bpf: Update verification logic for LSM programs The current logic checks if the name of the BTF type passed in attach_btf_id starts with "bpf_lsm_", this is not sufficient as it also allows attachment to non-LSM hooks like the very function that performs this check, i.e. bpf_lsm_verify_prog. In order to ensure that this verification logic allows attachment to only LSM hooks, the LSM_HOOK definitions in lsm_hook_defs.h are used to generate a BTF_ID set. Upon verification, the attach_btf_id of the program being attached is checked for presence in this set. Fixes: 9e4e01dfd325 ("bpf: lsm: Implement attach, detach and execution") Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201105230651.2621917-1-kpsingh@chromium.org --- kernel/bpf/bpf_lsm.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 78ea8a7bd27f..56cc5a915f67 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -13,6 +13,7 @@ #include #include #include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -26,7 +27,11 @@ noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ #include #undef LSM_HOOK -#define BPF_LSM_SYM_PREFX "bpf_lsm_" +#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME) +BTF_SET_START(bpf_lsm_hooks) +#include +#undef LSM_HOOK +BTF_SET_END(bpf_lsm_hooks) int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) @@ -37,8 +42,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return -EINVAL; } - if (strncmp(BPF_LSM_SYM_PREFX, prog->aux->attach_func_name, - sizeof(BPF_LSM_SYM_PREFX) - 1)) { + if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) { bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", prog->aux->attach_btf_id, prog->aux->attach_func_name); return -EINVAL; -- cgit v1.2.3 From 174fe5ba2d1ea0d6c5ab2a7d4aa058d6d497ae4d Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 29 Oct 2020 23:46:36 +0800 Subject: ext4: correctly report "not supported" for {usr,grp}jquota when !CONFIG_QUOTA The macro MOPT_Q is used to indicates the mount option is related to quota stuff and is defined to be MOPT_NOSUPPORT when CONFIG_QUOTA is disabled. Normally the quota options are handled explicitly, so it didn't matter that the MOPT_STRING flag was missing, even though the usrjquota and grpjquota mount options take a string argument. It's important that's present in the !CONFIG_QUOTA case, since without MOPT_STRING, the mount option matcher will match usrjquota= followed by an integer, and will otherwise skip the table entry, and so "mount option not supported" error message is never reported. [ Fixed up the commit description to better explain why the fix works. --TYT ] Fixes: 26092bf52478 ("ext4: use a table-driven handler for mount options") Signed-off-by: Kaixu Xia Link: https://lore.kernel.org/r/1603986396-28917-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ef4734b40e2a..5dbbd48923c1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2027,8 +2027,8 @@ static const struct mount_opts { {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), MOPT_CLEAR | MOPT_Q}, - {Opt_usrjquota, 0, MOPT_Q}, - {Opt_grpjquota, 0, MOPT_Q}, + {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING}, + {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING}, {Opt_offusrjquota, 0, MOPT_Q}, {Opt_offgrpjquota, 0, MOPT_Q}, {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, -- cgit v1.2.3 From a0650046d31d3ca92e7fb41ae5c667ed9250a2fc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 30 Oct 2020 10:24:35 +0800 Subject: MAINTAINERS: add missing file in ext4 entry include/trace/events/ext4.h belongs to ext4 module, add the file path into ext4 entry in MAINTAINERS. Signed-off-by: Chao Yu Link: https://lore.kernel.org/r/20201030022435.1136-1-yuchao0@huawei.com Signed-off-by: Theodore Ts'o --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index b516bb34a8d5..025d16ae0767 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6614,6 +6614,7 @@ Q: http://patchwork.ozlabs.org/project/linux-ext4/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git F: Documentation/filesystems/ext4/ F: fs/ext4/ +F: include/trace/events/ext4.h Extended Verification Module (EVM) M: Mimi Zohar -- cgit v1.2.3 From e121bd48b9eb8e3b9104d3d5d08fdf88e9ca0f97 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 30 Oct 2020 14:46:20 +0300 Subject: ext4: silence an uninitialized variable warning Smatch complains that "i" can be uninitialized if we don't enter the loop. I don't know if it's possible but we may as well silence this warning. [ Initialize i to sb->s_blocksize instead of 0. The only way the for loop could be skipped entirely is the in-memory data structures, in particular the bh->b_data for the on-disk superblock has gotten corrupted enough that calculated value of group is >= to ext4_get_groups_count(sb). In that case, we want to exit immediately without allocating a block. -- TYT ] Fixes: 8016e29f4362 ("ext4: fast commit recovery path") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20201030114620.GB3251003@mwanda Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 85abbfb98cbe..f067e83f149e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5167,7 +5167,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, struct super_block *sb = ar->inode->i_sb; ext4_group_t group; ext4_grpblk_t blkoff; - int i; + int i = sb->s_blocksize; ext4_fsblk_t goal, block; struct ext4_super_block *es = EXT4_SB(sb)->s_es; -- cgit v1.2.3 From 7067b2619017d51e71686ca9756b454de0e5826a Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 3 Nov 2020 10:29:02 +0800 Subject: ext4: unlock xattr_sem properly in ext4_inline_data_truncate() It takes xattr_sem to check inline data again but without unlock it in case not have. So unlock it before return. Fixes: aef1c8513c1f ("ext4: let ext4_truncate handle inline data correctly") Reported-by: Dan Carpenter Cc: Tao Ma Signed-off-by: Joseph Qi Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/1604370542-124630-1-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inline.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index caa51473207d..b41512d1badc 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1880,6 +1880,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) ext4_write_lock_xattr(inode, &no_expand); if (!ext4_has_inline_data(inode)) { + ext4_write_unlock_xattr(inode, &no_expand); *has_inline = 0; ext4_journal_stop(handle); return 0; -- cgit v1.2.3 From a44ad6835da52fdf4df2e482f45a167336555121 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:50 -0800 Subject: ext4: describe fast_commit feature flags Fast commit feature has flags in the file system as well in JBD2. The meaning of fast commit feature flags can get confusing. Update docs and code to add more documentation about it. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20201106035911.1942128-2-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- Documentation/filesystems/ext4/journal.rst | 6 ++++++ Documentation/filesystems/ext4/super.rst | 7 +++++++ fs/ext4/ext4.h | 7 +++++++ 3 files changed, 20 insertions(+) diff --git a/Documentation/filesystems/ext4/journal.rst b/Documentation/filesystems/ext4/journal.rst index 805a1e9ea3a5..849d5b119eb8 100644 --- a/Documentation/filesystems/ext4/journal.rst +++ b/Documentation/filesystems/ext4/journal.rst @@ -256,6 +256,10 @@ which is 1024 bytes long: - s\_padding2 - * - 0x54 + - \_\_be32 + - s\_num\_fc\_blocks + - Number of fast commit blocks in the journal. + * - 0x58 - \_\_u32 - s\_padding[42] - @@ -310,6 +314,8 @@ The journal incompat features are any combination of the following: - This journal uses v3 of the checksum on-disk format. This is the same as v2, but the journal block tag size is fixed regardless of the size of block numbers. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3) + * - 0x20 + - Journal has fast commit blocks. (JBD2\_FEATURE\_INCOMPAT\_FAST\_COMMIT) .. _jbd2_checksum_type: diff --git a/Documentation/filesystems/ext4/super.rst b/Documentation/filesystems/ext4/super.rst index 93e55d7c1d40..2eb1ab20498d 100644 --- a/Documentation/filesystems/ext4/super.rst +++ b/Documentation/filesystems/ext4/super.rst @@ -596,6 +596,13 @@ following: - Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs points to the two block groups that contain backup superblocks (COMPAT\_SPARSE\_SUPER2). + * - 0x400 + - Fast commits supported. Although fast commits blocks are + backward incompatible, fast commit blocks are not always + present in the journal. If fast commit blocks are present in + the journal, JBD2 incompat feature + (JBD2\_FEATURE\_INCOMPAT\_FAST\_COMMIT) gets + set (COMPAT\_FAST\_COMMIT). .. _super_incompat: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 45fcdbf538d1..d513553ceafa 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1863,6 +1863,13 @@ static inline bool ext4_verity_in_progress(struct inode *inode) #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 +/* + * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes + * incompatible only if fast commit blocks are present in the FS. Since we + * clear the journal (and thus the fast commit blocks), we don't mark FS as + * incompatible. We also have a JBD2 incompat feature, which gets set when + * there are fast commit blocks present in the journal. + */ #define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 #define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 -- cgit v1.2.3 From b21ebf143af219207214c79bc217beb39c43212a Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:51 -0800 Subject: ext4: mark fc ineligible if inode gets evictied due to mem pressure If inode gets evicted due to memory pressure, we have to remove it from the fast commit list. However, that inode may have uncommitted changes that fast commits will lose. So, just fall back to full commits in this case. Also, rename the fast commit ineligiblity reason from "EXT4_FC_REASON_MEM" to "EXT4_FC_REASON_MEM_NOMEM" for better expression. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-3-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 4 ++-- fs/ext4/fast_commit.h | 2 +- fs/ext4/inode.c | 2 ++ include/trace/events/ext4.h | 6 +++--- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 8d43058386c3..48b7bc129392 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -384,7 +384,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) mutex_unlock(&ei->i_fc_lock); node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -397,7 +397,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_MEM); + EXT4_FC_REASON_NOMEM); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 06907d485989..140fbb6af19e 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -100,7 +100,7 @@ enum { EXT4_FC_REASON_XATTR = 0, EXT4_FC_REASON_CROSS_RENAME, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, - EXT4_FC_REASON_MEM, + EXT4_FC_REASON_NOMEM, EXT4_FC_REASON_SWAP_BOOT, EXT4_FC_REASON_RESIZE, EXT4_FC_REASON_RENAME_DIR, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b96a18679a27..081b6a86b5db 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -327,6 +327,8 @@ stop_handle: ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: + if (!list_empty(&EXT4_I(inode)->i_fc_list)) + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index b14314fcf732..0f899d3b09d3 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -100,7 +100,7 @@ TRACE_DEFINE_ENUM(ES_REFERENCED_B); { EXT4_FC_REASON_XATTR, "XATTR"}, \ { EXT4_FC_REASON_CROSS_RENAME, "CROSS_RENAME"}, \ { EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \ - { EXT4_FC_REASON_MEM, "NO_MEM"}, \ + { EXT4_FC_REASON_NOMEM, "NO_MEM"}, \ { EXT4_FC_REASON_SWAP_BOOT, "SWAP_BOOT"}, \ { EXT4_FC_REASON_RESIZE, "RESIZE"}, \ { EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \ @@ -2917,13 +2917,13 @@ TRACE_EVENT(ext4_fc_stats, ), TP_printk("dev %d:%d fc ineligible reasons:\n" - "%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s,%d; " + "%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d; " "num_commits:%ld, ineligible: %ld, numblks: %ld", MAJOR(__entry->dev), MINOR(__entry->dev), FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR), FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME), FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE), - FC_REASON_NAME_STAT(EXT4_FC_REASON_MEM), + FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM), FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT), FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE), FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR), -- cgit v1.2.3 From 5b552ad70c6197e764ffe6070089c5b355fe2d26 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:52 -0800 Subject: ext4: drop redundant calls ext4_fc_track_range ext4_fc_track_range() should only be called when blocks are added or removed from an inode. So, the only places from where we need to call this function are ext4_map_blocks(), punch hole, collapse / zero range, truncate. Remove all the other redundant calls to ths function. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-4-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 5 ----- fs/ext4/super.c | 4 ---- 2 files changed, 9 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 57cfa28919a9..94ba44785d28 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3724,7 +3724,6 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: ext4_ext_show_leaf(inode, path); - ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1); return err; } @@ -3796,7 +3795,6 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; - ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1); return 0; } @@ -4329,7 +4327,6 @@ got_allocated_blocks: map->m_len = ar.len; allocated = map->m_len; ext4_ext_show_leaf(inode, path); - ext4_fc_track_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1); out: ext4_ext_drop_refs(path); kfree(path); @@ -4651,8 +4648,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - ext4_fc_track_range(inode, offset >> blkbits, - (offset + len - 1) >> blkbits); ext4_fc_start_update(inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5dbbd48923c1..738a6dd4957d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6560,10 +6560,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, brelse(bh); out: if (inode->i_size < off + len) { - ext4_fc_track_range(inode, - (inode->i_size > 0 ? inode->i_size - 1 : 0) - >> inode->i_sb->s_blocksize_bits, - (off + len) >> inode->i_sb->s_blocksize_bits); i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; err2 = ext4_mark_inode_dirty(handle, inode); -- cgit v1.2.3 From a80f7fcf18672ae4971a6b713b58c0d389aa99fe Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:53 -0800 Subject: ext4: fixup ext4_fc_track_* functions' signature Firstly, pass handle to all ext4_fc_track_* functions and use transaction id found in handle->h_transaction->h_tid for tracking fast commit updates. Secondly, don't pass inode to ext4_fc_track_link/create/unlink functions. inode can be found inside these functions as d_inode(dentry). However, rename path is an exeception. That's because in that case, we need inode that's not same as d_inode(dentry). To handle that, add a couple of low-level wrapper functions that take inode and dentry as arguments. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-5-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 16 +++++++++----- fs/ext4/extents.c | 2 +- fs/ext4/fast_commit.c | 48 +++++++++++++++++++++++++--------------- fs/ext4/inode.c | 10 ++++----- fs/ext4/namei.c | 61 +++++++++++++++++++++++---------------------------- 5 files changed, 74 insertions(+), 63 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d513553ceafa..c6339bd75a93 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2738,12 +2738,16 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); int ext4_fc_info_show(struct seq_file *seq, void *v); void ext4_fc_init(struct super_block *sb, journal_t *journal); void ext4_fc_init_inode(struct inode *inode); -void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, +void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); -void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry); -void ext4_fc_track_link(struct inode *inode, struct dentry *dentry); -void ext4_fc_track_create(struct inode *inode, struct dentry *dentry); -void ext4_fc_track_inode(struct inode *inode); +void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void __ext4_fc_track_link(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason); void ext4_fc_start_ineligible(struct super_block *sb, int reason); void ext4_fc_stop_ineligible(struct super_block *sb); @@ -3459,7 +3463,7 @@ extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, extern int ext4_ci_compare(const struct inode *parent, const struct qstr *fname, const struct qstr *entry, bool quick); -extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, +extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, struct inode *inode); extern int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 94ba44785d28..17d7096b3212 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4599,7 +4599,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - ext4_fc_track_range(inode, offset >> inode->i_sb->s_blocksize_bits, + ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits, (offset + len - 1) >> inode->i_sb->s_blocksize_bits); /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 48b7bc129392..9399e9cccb7e 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -323,13 +323,14 @@ static inline int ext4_fc_is_ineligible(struct super_block *sb) * If enqueue is set, this function enqueues the inode in fast commit list. */ static int ext4_fc_track_template( - struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool), + handle_t *handle, struct inode *inode, + int (*__fc_track_fn)(struct inode *, void *, bool), void *args, int enqueue) { - tid_t running_txn_tid; bool update = false; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + tid_t tid = 0; int ret; if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || @@ -339,15 +340,13 @@ static int ext4_fc_track_template( if (ext4_fc_is_ineligible(inode->i_sb)) return -EINVAL; - running_txn_tid = sbi->s_journal ? - sbi->s_journal->j_commit_sequence + 1 : 0; - + tid = handle->h_transaction->t_tid; mutex_lock(&ei->i_fc_lock); - if (running_txn_tid == ei->i_sync_tid) { + if (tid == ei->i_sync_tid) { update = true; } else { ext4_fc_reset_inode(inode); - ei->i_sync_tid = running_txn_tid; + ei->i_sync_tid = tid; } ret = __fc_track_fn(inode, args, update); mutex_unlock(&ei->i_fc_lock); @@ -422,7 +421,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) return 0; } -void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry) +void __ext4_fc_track_unlink(handle_t *handle, + struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; @@ -430,12 +430,18 @@ void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry) args.dentry = dentry; args.op = EXT4_FC_TAG_UNLINK; - ret = ext4_fc_track_template(inode, __track_dentry_update, + ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_unlink(inode, dentry, ret); } -void ext4_fc_track_link(struct inode *inode, struct dentry *dentry) +void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) +{ + __ext4_fc_track_unlink(handle, d_inode(dentry), dentry); +} + +void __ext4_fc_track_link(handle_t *handle, + struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; @@ -443,20 +449,26 @@ void ext4_fc_track_link(struct inode *inode, struct dentry *dentry) args.dentry = dentry; args.op = EXT4_FC_TAG_LINK; - ret = ext4_fc_track_template(inode, __track_dentry_update, + ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_link(inode, dentry, ret); } -void ext4_fc_track_create(struct inode *inode, struct dentry *dentry) +void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) +{ + __ext4_fc_track_link(handle, d_inode(dentry), dentry); +} + +void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct __track_dentry_update_args args; + struct inode *inode = d_inode(dentry); int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_CREAT; - ret = ext4_fc_track_template(inode, __track_dentry_update, + ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_create(inode, dentry, ret); } @@ -472,14 +484,14 @@ static int __track_inode(struct inode *inode, void *arg, bool update) return 0; } -void ext4_fc_track_inode(struct inode *inode) +void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { int ret; if (S_ISDIR(inode->i_mode)) return; - ret = ext4_fc_track_template(inode, __track_inode, NULL, 1); + ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(inode, ret); } @@ -515,7 +527,7 @@ static int __track_range(struct inode *inode, void *arg, bool update) return 0; } -void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, +void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct __track_range_args args; @@ -527,7 +539,7 @@ void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, args.start = start; args.end = end; - ret = ext4_fc_track_template(inode, __track_range, &args, 1); + ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); trace_ext4_fc_track_range(inode, start, end, ret); } @@ -1263,7 +1275,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl) return 0; } - ret = __ext4_unlink(old_parent, &entry, inode); + ret = __ext4_unlink(NULL, old_parent, &entry, inode); /* -ENOENT ok coz it might not exist anymore. */ if (ret == -ENOENT) ret = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 081b6a86b5db..87120c4c44f3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -732,7 +732,7 @@ out_sem: if (ret) return ret; } - ext4_fc_track_range(inode, map->m_lblk, + ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + map->m_len - 1); } @@ -4111,7 +4111,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) up_write(&EXT4_I(inode)->i_data_sem); } - ext4_fc_track_range(inode, first_block, stop_block); + ext4_fc_track_range(handle, inode, first_block, stop_block); if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -5444,14 +5444,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (shrink) - ext4_fc_track_range(inode, + ext4_fc_track_range(handle, inode, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits, (oldsize > 0 ? oldsize - 1 : 0) >> inode->i_sb->s_blocksize_bits); else ext4_fc_track_range( - inode, + handle, inode, (oldsize > 0 ? oldsize - 1 : oldsize) >> inode->i_sb->s_blocksize_bits, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> @@ -5701,7 +5701,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, put_bh(iloc->bh); return -EIO; } - ext4_fc_track_inode(inode); + ext4_fc_track_inode(handle, inode); if (IS_I_VERSION(inode)) inode_inc_iversion(inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f458d1d81d96..33509266f5a0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2606,7 +2606,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { handle_t *handle; - struct inode *inode, *inode_save; + struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); @@ -2624,11 +2624,9 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - inode_save = inode; - ihold(inode_save); err = ext4_add_nondir(handle, dentry, &inode); - ext4_fc_track_create(inode_save, dentry); - iput(inode_save); + if (!err) + ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); @@ -2643,7 +2641,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { handle_t *handle; - struct inode *inode, *inode_save; + struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); @@ -2660,12 +2658,9 @@ retry: if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; - inode_save = inode; - ihold(inode_save); err = ext4_add_nondir(handle, dentry, &inode); if (!err) - ext4_fc_track_create(inode_save, dentry); - iput(inode_save); + ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); @@ -2829,7 +2824,6 @@ out_clear_inode: iput(inode); goto out_retry; } - ext4_fc_track_create(inode, dentry); ext4_inc_count(dir); ext4_update_dx_flag(dir); @@ -2837,6 +2831,7 @@ out_clear_inode: if (err) goto out_clear_inode; d_instantiate_new(dentry, inode); + ext4_fc_track_create(handle, dentry); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); @@ -3171,7 +3166,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) goto end_rmdir; ext4_dec_count(dir); ext4_update_dx_flag(dir); - ext4_fc_track_unlink(inode, dentry); + ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); #ifdef CONFIG_UNICODE @@ -3192,13 +3187,12 @@ end_rmdir: return retval; } -int __ext4_unlink(struct inode *dir, const struct qstr *d_name, +int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, struct inode *inode) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; - handle_t *handle = NULL; int skip_remove_dentry = 0; bh = ext4_find_entry(dir, d_name, &de, NULL); @@ -3217,14 +3211,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) skip_remove_dentry = 1; else - goto out_bh; - } - - handle = ext4_journal_start(dir, EXT4_HT_DIR, - EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) { - retval = PTR_ERR(handle); - goto out_bh; + goto out; } if (IS_DIRSYNC(dir)) @@ -3233,12 +3220,12 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, if (!skip_remove_dentry) { retval = ext4_delete_entry(handle, dir, de, bh); if (retval) - goto out_handle; + goto out; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) - goto out_handle; + goto out; } else { retval = 0; } @@ -3252,15 +3239,14 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, inode->i_ctime = current_time(inode); retval = ext4_mark_inode_dirty(handle, inode); -out_handle: - ext4_journal_stop(handle); -out_bh: +out: brelse(bh); return retval; } static int ext4_unlink(struct inode *dir, struct dentry *dentry) { + handle_t *handle; int retval; if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) @@ -3278,9 +3264,16 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (retval) goto out_trace; - retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry)); + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + goto out_trace; + } + + retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry)); if (!retval) - ext4_fc_track_unlink(d_inode(dentry), dentry); + ext4_fc_track_unlink(handle, dentry); #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid @@ -3291,6 +3284,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif + if (handle) + ext4_journal_stop(handle); out_trace: trace_ext4_unlink_exit(dentry, retval); @@ -3447,7 +3442,6 @@ retry: err = ext4_add_entry(handle, dentry, inode); if (!err) { - ext4_fc_track_link(inode, dentry); err = ext4_mark_inode_dirty(handle, inode); /* this can happen only for tmpfile being * linked the first time @@ -3455,6 +3449,7 @@ retry: if (inode->i_nlink == 1) ext4_orphan_del(handle, inode); d_instantiate(dentry, inode); + ext4_fc_track_link(handle, dentry); } else { drop_nlink(inode); iput(inode); @@ -3915,9 +3910,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, EXT4_FC_REASON_RENAME_DIR); } else { if (new.inode) - ext4_fc_track_unlink(new.inode, new.dentry); - ext4_fc_track_link(old.inode, new.dentry); - ext4_fc_track_unlink(old.inode, old.dentry); + ext4_fc_track_unlink(handle, new.dentry); + __ext4_fc_track_link(handle, old.inode, new.dentry); + __ext4_fc_track_unlink(handle, old.inode, old.dentry); } if (new.inode) { -- cgit v1.2.3 From ede7dc7fa0af619afc08995776eadb9ff3b0a711 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:54 -0800 Subject: jbd2: rename j_maxlen to j_total_len and add jbd2_journal_max_txn_bufs The on-disk superblock field sb->s_maxlen represents the total size of the journal including the fast commit area and is no more the max number of blocks available for a transaction. The maximum number of blocks available to a transaction is reduced by the number of fast commit blocks. So, this patch renames j_maxlen to j_total_len to better represent its intent. Also, it adds a function to calculate max number of bufs available for a transaction. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-6-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fsmap.c | 2 +- fs/ext4/super.c | 2 +- fs/jbd2/commit.c | 2 +- fs/jbd2/journal.c | 12 ++++++------ fs/jbd2/recovery.c | 6 +++--- fs/ocfs2/journal.c | 2 +- include/linux/jbd2.h | 9 +++++++-- 7 files changed, 20 insertions(+), 15 deletions(-) diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index b232c2767534..4c2a9fe30067 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -280,7 +280,7 @@ static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys, /* Fabricate an rmap entry for the external log device. */ irec.fmr_physical = journal->j_blk_offset; - irec.fmr_length = journal->j_maxlen; + irec.fmr_length = journal->j_total_len; irec.fmr_owner = EXT4_FMR_OWN_LOG; irec.fmr_flags = 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 738a6dd4957d..8a6dd433bb70 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3976,7 +3976,7 @@ int ext4_calculate_overhead(struct super_block *sb) * loaded or not */ if (sbi->s_journal && !sbi->s_journal_bdev) - overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); + overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index fa688e163a80..ec516490cb35 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -801,7 +801,7 @@ start_journal_io: if (first_block < journal->j_tail) freed += journal->j_last - journal->j_first; /* Update tail only if we free significant amount of space */ - if (freed < journal->j_maxlen / 4) + if (freed < jbd2_journal_get_max_txn_bufs(journal)) update_tail = 0; } J_ASSERT(commit_transaction->t_state == T_COMMIT); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0c7c42bd530f..c3c768248527 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1348,7 +1348,7 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_dev = bdev; journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; - journal->j_maxlen = len; + journal->j_total_len = len; /* We need enough buffers to write out full descriptor block. */ n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; @@ -1531,7 +1531,7 @@ static int journal_reset(journal_t *journal) journal->j_commit_sequence = journal->j_transaction_sequence - 1; journal->j_commit_request = journal->j_commit_sequence; - journal->j_max_transaction_buffers = journal->j_maxlen / 4; + journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal); /* * As a special case, if the on-disk copy is already marked as needing @@ -1792,15 +1792,15 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) - journal->j_maxlen = be32_to_cpu(sb->s_maxlen); - else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { + if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) + journal->j_total_len = be32_to_cpu(sb->s_maxlen); + else if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { printk(KERN_WARNING "JBD2: journal file too short\n"); goto out; } if (be32_to_cpu(sb->s_first) == 0 || - be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + be32_to_cpu(sb->s_first) >= journal->j_total_len) { printk(KERN_WARNING "JBD2: Invalid start block of journal: %u\n", be32_to_cpu(sb->s_first)); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index eb2606133cd8..dc0694fcfcd1 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -74,8 +74,8 @@ static int do_readahead(journal_t *journal, unsigned int start) /* Do up to 128K of readahead */ max = start + (128 * 1024 / journal->j_blocksize); - if (max > journal->j_maxlen) - max = journal->j_maxlen; + if (max > journal->j_total_len) + max = journal->j_total_len; /* Do the readahead itself. We'll submit MAXBUF buffer_heads at * a time to the block device IO layer. */ @@ -134,7 +134,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, *bhp = NULL; - if (offset >= journal->j_maxlen) { + if (offset >= journal->j_total_len) { printk(KERN_ERR "JBD2: corrupted journal superblock\n"); return -EFSCORRUPTED; } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index b9a9d69dde7e..db52e843002a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -877,7 +877,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) goto done; } - trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen); + trace_ocfs2_journal_init_maxlen(j_journal->j_total_len); *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & OCFS2_JOURNAL_DIRTY_FL); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 1d5566af48ac..e0b6b53eae64 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -988,9 +988,9 @@ struct journal_s struct block_device *j_fs_dev; /** - * @j_maxlen: Total maximum capacity of the journal region on disk. + * @j_total_len: Total maximum capacity of the journal region on disk. */ - unsigned int j_maxlen; + unsigned int j_total_len; /** * @j_reserved_credits: @@ -1624,6 +1624,11 @@ int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); int jbd2_fc_release_bufs(journal_t *journal); +static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal) +{ + return (journal->j_total_len - journal->j_fc_wbufsize) / 4; +} + /* * is_journal_abort * -- cgit v1.2.3 From a1e5e465b31d6015fccb359d99053b39e5180466 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:55 -0800 Subject: ext4: clean up the JBD2 API that initializes fast commits This patch removes jbd2_fc_init() API and its related functions to simplify enabling fast commits. With this change, the number of fast commit blocks to use is solely determined by the JBD2 layer. So, we move the default value for minimum number of fast commit blocks from ext4/fast_commit.h to include/linux/jbd2.h. However, whether or not to use fast commits is determined by the file system. The file system just sets the fast commit feature using jbd2_journal_set_features(). JBD2 layer then determines how many blocks to use for fast commits (based on the value found in the JBD2 superblock). Note that the JBD2 feature flag of fast commits is just an indication that there are fast commit blocks present on disk. It doesn't tell JBD2 layer about the intent of the file system of whether to it wants to use fast commit or not. That's why, we blindly clear the fast commit flag in journal_reset() after the recovery is done. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-7-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 14 -------- fs/ext4/fast_commit.h | 3 -- fs/ext4/super.c | 8 +++++ fs/jbd2/journal.c | 96 ++++++++++++++++++++++++++++++--------------------- include/linux/jbd2.h | 2 +- 5 files changed, 65 insertions(+), 58 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 9399e9cccb7e..bab60c5d5095 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -2091,8 +2091,6 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, void ext4_fc_init(struct super_block *sb, journal_t *journal) { - int num_fc_blocks; - /* * We set replay callback even if fast commit disabled because we may * could still have fast commit blocks that need to be replayed even if @@ -2102,18 +2100,6 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal) if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return; journal->j_fc_cleanup_callback = ext4_fc_cleanup; - if (!buffer_uptodate(journal->j_sb_buffer) - && ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, - true)) { - ext4_msg(sb, KERN_ERR, "I/O error on journal"); - return; - } - num_fc_blocks = be32_to_cpu(journal->j_superblock->s_num_fc_blks); - if (jbd2_fc_init(journal, num_fc_blocks ? num_fc_blocks : - EXT4_NUM_FC_BLKS)) { - pr_warn("Error while enabling fast commits, turning off."); - ext4_clear_feature_fast_commit(sb); - } } const char *fc_ineligible_reasons[] = { diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 140fbb6af19e..1d96e0ac8138 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -3,9 +3,6 @@ #ifndef __FAST_COMMIT_H__ #define __FAST_COMMIT_H__ -/* Number of blocks in journal area to allocate for fast commits */ -#define EXT4_NUM_FC_BLKS 256 - /* Fast commit tags */ #define EXT4_FC_TAG_ADD_RANGE 0x0001 #define EXT4_FC_TAG_DEL_RANGE 0x0002 diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8a6dd433bb70..ba02d7c86fb3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4857,6 +4857,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } + if (test_opt2(sb, JOURNAL_FAST_COMMIT) && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { + ext4_msg(sb, KERN_ERR, + "Failed to set fast commit journal feature"); + goto failed_mount_wq; + } + /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c3c768248527..500152f0421a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1352,19 +1352,12 @@ static journal_t *journal_init_common(struct block_device *bdev, /* We need enough buffers to write out full descriptor block. */ n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; + journal->j_fc_wbuf = NULL; journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), GFP_KERNEL); if (!journal->j_wbuf) goto err_cleanup; - if (journal->j_fc_wbufsize > 0) { - journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, - sizeof(struct buffer_head *), - GFP_KERNEL); - if (!journal->j_fc_wbuf) - goto err_cleanup; - } - bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); if (!bh) { pr_err("%s: Cannot get buffer for journal superblock\n", @@ -1378,23 +1371,11 @@ static journal_t *journal_init_common(struct block_device *bdev, err_cleanup: kfree(journal->j_wbuf); - kfree(journal->j_fc_wbuf); jbd2_journal_destroy_revoke(journal); kfree(journal); return NULL; } -int jbd2_fc_init(journal_t *journal, int num_fc_blks) -{ - journal->j_fc_wbufsize = num_fc_blks; - journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, - sizeof(struct buffer_head *), GFP_KERNEL); - if (!journal->j_fc_wbuf) - return -ENOMEM; - return 0; -} -EXPORT_SYMBOL(jbd2_fc_init); - /* jbd2_journal_init_dev and jbd2_journal_init_inode: * * Create a journal structure assigned some fixed set of disk blocks to @@ -1512,16 +1493,7 @@ static int journal_reset(journal_t *journal) } journal->j_first = first; - - if (jbd2_has_feature_fast_commit(journal) && - journal->j_fc_wbufsize > 0) { - journal->j_fc_last = last; - journal->j_last = last - journal->j_fc_wbufsize; - journal->j_fc_first = journal->j_last + 1; - journal->j_fc_off = 0; - } else { - journal->j_last = last; - } + journal->j_last = last; journal->j_head = journal->j_first; journal->j_tail = journal->j_first; @@ -1533,6 +1505,13 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal); + /* + * Now that journal recovery is done, turn fast commits off here. This + * way, if fast commit was enabled before the crash but if now FS has + * disabled it, we don't enable fast commits. + */ + jbd2_clear_feature_fast_commit(journal); + /* * As a special case, if the on-disk copy is already marked as needing * no recovery (s_start == 0), then we can safely defer the superblock @@ -1872,6 +1851,7 @@ static int load_superblock(journal_t *journal) { int err; journal_superblock_t *sb; + int num_fc_blocks; err = journal_get_superblock(journal); if (err) @@ -1883,15 +1863,17 @@ static int load_superblock(journal_t *journal) journal->j_tail = be32_to_cpu(sb->s_start); journal->j_first = be32_to_cpu(sb->s_first); journal->j_errno = be32_to_cpu(sb->s_errno); + journal->j_last = be32_to_cpu(sb->s_maxlen); - if (jbd2_has_feature_fast_commit(journal) && - journal->j_fc_wbufsize > 0) { + if (jbd2_has_feature_fast_commit(journal)) { journal->j_fc_last = be32_to_cpu(sb->s_maxlen); - journal->j_last = journal->j_fc_last - journal->j_fc_wbufsize; + num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks); + if (!num_fc_blocks) + num_fc_blocks = JBD2_MIN_FC_BLOCKS; + if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) + journal->j_last = journal->j_fc_last - num_fc_blocks; journal->j_fc_first = journal->j_last + 1; journal->j_fc_off = 0; - } else { - journal->j_last = be32_to_cpu(sb->s_maxlen); } return 0; @@ -1954,9 +1936,6 @@ int jbd2_journal_load(journal_t *journal) */ journal->j_flags &= ~JBD2_ABORT; - if (journal->j_fc_wbufsize > 0) - jbd2_journal_set_features(journal, 0, 0, - JBD2_FEATURE_INCOMPAT_FAST_COMMIT); /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ @@ -2040,8 +2019,7 @@ int jbd2_journal_destroy(journal_t *journal) jbd2_journal_destroy_revoke(journal); if (journal->j_chksum_driver) crypto_free_shash(journal->j_chksum_driver); - if (journal->j_fc_wbufsize > 0) - kfree(journal->j_fc_wbuf); + kfree(journal->j_fc_wbuf); kfree(journal->j_wbuf); kfree(journal); @@ -2116,6 +2094,37 @@ int jbd2_journal_check_available_features(journal_t *journal, unsigned long comp return 0; } +static int +jbd2_journal_initialize_fast_commit(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned long long num_fc_blks; + + num_fc_blks = be32_to_cpu(sb->s_num_fc_blks); + if (num_fc_blks == 0) + num_fc_blks = JBD2_MIN_FC_BLOCKS; + if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) + return -ENOSPC; + + /* Are we called twice? */ + WARN_ON(journal->j_fc_wbuf != NULL); + journal->j_fc_wbuf = kmalloc_array(num_fc_blks, + sizeof(struct buffer_head *), GFP_KERNEL); + if (!journal->j_fc_wbuf) + return -ENOMEM; + + journal->j_fc_wbufsize = num_fc_blks; + journal->j_fc_last = journal->j_last; + journal->j_last = journal->j_fc_last - num_fc_blks; + journal->j_fc_first = journal->j_last + 1; + journal->j_fc_off = 0; + journal->j_free = journal->j_last - journal->j_first; + journal->j_max_transaction_buffers = + jbd2_journal_get_max_txn_bufs(journal); + + return 0; +} + /** * int jbd2_journal_set_features() - Mark a given journal feature in the superblock * @journal: Journal to act on. @@ -2159,6 +2168,13 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, sb = journal->j_superblock; + if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) { + if (jbd2_journal_initialize_fast_commit(journal)) { + pr_err("JBD2: Cannot enable fast commits.\n"); + return 0; + } + } + /* Load the checksum driver if necessary */ if ((journal->j_chksum_driver == NULL) && INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index e0b6b53eae64..b2caf7bbd8e5 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -68,6 +68,7 @@ extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 +#define JBD2_MIN_FC_BLOCKS 256 #ifdef __KERNEL__ @@ -1614,7 +1615,6 @@ extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ -int jbd2_fc_init(journal_t *journal, int num_fc_blks); int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid); -- cgit v1.2.3 From 37e0a30e94f1aa25f16b403dfabb64e0b806de0b Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:56 -0800 Subject: jbd2: drop jbd2_fc_init documentation Now that jbd2_fc_init is dropped, drop its docs too. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-8-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- Documentation/filesystems/journalling.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/journalling.rst b/Documentation/filesystems/journalling.rst index 5a5f70b4063e..e18f90ffc6fd 100644 --- a/Documentation/filesystems/journalling.rst +++ b/Documentation/filesystems/journalling.rst @@ -136,10 +136,8 @@ Fast commits ~~~~~~~~~~~~ JBD2 to also allows you to perform file-system specific delta commits known as -fast commits. In order to use fast commits, you first need to call -:c:func:`jbd2_fc_init` and tell how many blocks at the end of journal -area should be reserved for fast commits. Along with that, you will also need -to set following callbacks that perform correspodning work: +fast commits. In order to use fast commits, you will need to set following +callbacks that perform correspodning work: `journal->j_fc_cleanup_cb`: Cleanup function called after every full commit and fast commit. -- cgit v1.2.3 From c460e5edc85a063ec9cb60addff93d00ed378701 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:57 -0800 Subject: jbd2: don't use state lock during commit path Variables journal->j_fc_off, journal->j_fc_wbuf are accessed during commit path. Since today we allow only one process to perform a fast commit, there is no need take state lock before accessing these variables. This patch removes these locks and adds comments to describe this. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-9-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 6 ------ include/linux/jbd2.h | 10 ++++++---- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 500152f0421a..778ea50fc8d1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -865,7 +865,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) int fc_off; *bh_out = NULL; - write_lock(&journal->j_state_lock); if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) { fc_off = journal->j_fc_off; @@ -874,7 +873,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) } else { ret = -EINVAL; } - write_unlock(&journal->j_state_lock); if (ret) return ret; @@ -909,9 +907,7 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) struct buffer_head *bh; int i, j_fc_off; - read_lock(&journal->j_state_lock); j_fc_off = journal->j_fc_off; - read_unlock(&journal->j_state_lock); /* * Wait in reverse order to minimize chances of us being woken up before @@ -939,9 +935,7 @@ int jbd2_fc_release_bufs(journal_t *journal) struct buffer_head *bh; int i, j_fc_off; - read_lock(&journal->j_state_lock); j_fc_off = journal->j_fc_off; - read_unlock(&journal->j_state_lock); /* * Wait in reverse order to minimize chances of us being woken up before diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b2caf7bbd8e5..5f0ef6380b0c 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -945,8 +945,9 @@ struct journal_s /** * @j_fc_off: * - * Number of fast commit blocks currently allocated. - * [j_state_lock]. + * Number of fast commit blocks currently allocated. Accessed only + * during fast commit. Currently only process can do fast commit, so + * this field is not protected by any lock. */ unsigned long j_fc_off; @@ -1109,8 +1110,9 @@ struct journal_s struct buffer_head **j_wbuf; /** - * @j_fc_wbuf: Array of fast commit bhs for - * jbd2_journal_commit_transaction. + * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only + * during a fast commit. Currently only process can do fast commit, so + * this field is not protected by any lock. */ struct buffer_head **j_fc_wbuf; -- cgit v1.2.3 From 0bce577bf9cae13ae32d391432d0030e3f67fc1d Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:58 -0800 Subject: jbd2: don't pass tid to jbd2_fc_end_commit_fallback() In jbd2_fc_end_commit_fallback(), we know which tid to commit. There's no need for caller to pass it. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-10-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 2 +- fs/jbd2/journal.c | 12 +++++++++--- include/linux/jbd2.h | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index bab60c5d5095..e69c580fa91e 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1143,7 +1143,7 @@ out: "Fast commit ended with blks = %d, reason = %d, subtid - %d", nblks, reason, subtid); if (reason == EXT4_FC_REASON_FC_FAILED) - return jbd2_fc_end_commit_fallback(journal, commit_tid); + return jbd2_fc_end_commit_fallback(journal); if (reason == EXT4_FC_REASON_FC_START_FAILED || reason == EXT4_FC_REASON_INELIGIBLE) return jbd2_complete_transaction(journal, commit_tid); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 778ea50fc8d1..59166e299cde 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -777,13 +777,19 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) int jbd2_fc_end_commit(journal_t *journal) { - return __jbd2_fc_end_commit(journal, 0, 0); + return __jbd2_fc_end_commit(journal, 0, false); } EXPORT_SYMBOL(jbd2_fc_end_commit); -int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid) +int jbd2_fc_end_commit_fallback(journal_t *journal) { - return __jbd2_fc_end_commit(journal, tid, 1); + tid_t tid; + + read_lock(&journal->j_state_lock); + tid = journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0; + read_unlock(&journal->j_state_lock); + return __jbd2_fc_end_commit(journal, tid, true); } EXPORT_SYMBOL(jbd2_fc_end_commit_fallback); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5f0ef6380b0c..1c49fd62ff2e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1619,7 +1619,7 @@ extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); -int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid); +int jbd2_fc_end_commit_fallback(journal_t *journal); int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); int jbd2_submit_inode_data(struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); -- cgit v1.2.3 From cc80586a57f704f806b9a1b99a21cd07e37dbedc Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:59 -0800 Subject: jbd2: add todo for a fast commit performance optimization Fast commit performance can be optimized if commit thread doesn't wait for ongoing fast commits to complete until the transaction enters T_FLUSH state. Document this optimization. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-11-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index ec516490cb35..b121d7d434c6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -450,6 +450,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) schedule(); write_lock(&journal->j_state_lock); finish_wait(&journal->j_fc_wait, &wait); + /* + * TODO: by blocking fast commits here, we are increasing + * fsync() latency slightly. Strictly speaking, we don't need + * to block fast commits until the transaction enters T_FLUSH + * state. So an optimization is possible where we block new fast + * commits here and wait for existing ones to complete + * just before we enter T_FLUSH. That way, the existing fast + * commits and this full commit can proceed parallely. + */ } write_unlock(&journal->j_state_lock); -- cgit v1.2.3 From 0ee66ddcf3c1503a9bdb3e49a7a96c6e429ddfad Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:59:00 -0800 Subject: jbd2: don't touch buffer state until it is filled Fast commit buffers should be filled in before toucing their state. Remove code that sets buffer state as dirty before the buffer is passed to the file system. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-12-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 59166e299cde..b5fbcd1b444c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -891,11 +891,7 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) if (!bh) return -ENOMEM; - lock_buffer(bh); - clear_buffer_uptodate(bh); - set_buffer_dirty(bh); - unlock_buffer(bh); journal->j_fc_wbuf[fc_off] = bh; *bh_out = bh; -- cgit v1.2.3 From 480f89d553260e7823920e687846877bebc8dca0 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:59:01 -0800 Subject: jbd2: don't read journal->j_commit_sequence without taking a lock Take journal state lock before reading journal->j_commit_sequence. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-13-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b5fbcd1b444c..f7ebf6ef69af 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -734,10 +734,12 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) if (!journal->j_stats.ts_tid) return -EINVAL; - if (tid <= journal->j_commit_sequence) + write_lock(&journal->j_state_lock); + if (tid <= journal->j_commit_sequence) { + write_unlock(&journal->j_state_lock); return -EALREADY; + } - write_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING || (journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) { DEFINE_WAIT(wait); -- cgit v1.2.3 From f6634e2609d13d7aa8852734e16300845db915d5 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:59:02 -0800 Subject: ext4: dedpulicate the code to wait on inode that's being committed This patch removes the deduplicates the code that implements waiting on inode that's being committed. That code is moved into a new function. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-14-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 61 +++++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index e69c580fa91e..fc5a5e6a581d 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -155,6 +155,30 @@ void ext4_fc_init_inode(struct inode *inode) ei->i_fc_committed_subtid = 0; } +/* This function must be called with sbi->s_fc_lock held. */ +static void ext4_fc_wait_committing_inode(struct inode *inode) +{ + wait_queue_head_t *wq; + struct ext4_inode_info *ei = EXT4_I(inode); + +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); +} + /* * Inform Ext4's fast about start of an inode update * @@ -176,22 +200,7 @@ restart: goto out; if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - wait_queue_head_t *wq; -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); + ext4_fc_wait_committing_inode(inode); goto restart; } out: @@ -234,26 +243,10 @@ restart: } if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - wait_queue_head_t *wq; -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); + ext4_fc_wait_committing_inode(inode); goto restart; } - if (!list_empty(&ei->i_fc_list)) - list_del_init(&ei->i_fc_list); + list_del_init(&ei->i_fc_list); spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); } -- cgit v1.2.3 From a740762fb3b36dbdddb63ebe65b71cea3014f1c3 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:59:03 -0800 Subject: ext4: fix code documentatioon Add a TODO to remember fixing REQ_FUA | REQ_PREFLUSH for fast commit buffers. Also, fix a typo in top level comment in fast_commit.c Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-15-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index fc5a5e6a581d..639b2a308c7b 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -83,7 +83,7 @@ * * Atomicity of commits * -------------------- - * In order to gaurantee atomicity during the commit operation, fast commit + * In order to guarantee atomicity during the commit operation, fast commit * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail * tag contains CRC of the contents and TID of the transaction after which * this fast commit should be applied. Recovery code replays fast commit @@ -542,6 +542,7 @@ static void ext4_fc_submit_bh(struct super_block *sb) int write_flags = REQ_SYNC; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; + /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */ if (test_opt(sb, BARRIER)) write_flags |= REQ_FUA | REQ_PREFLUSH; lock_buffer(bh); -- cgit v1.2.3 From 764b3fd31d131c4b8b5fa064aa94382091923aec Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:59:04 -0800 Subject: ext4: mark buf dirty before submitting fast commit buffer Mark the fast commit buffer as dirty before submission. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-16-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 639b2a308c7b..05e6e76a7663 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -546,7 +546,7 @@ static void ext4_fc_submit_bh(struct super_block *sb) if (test_opt(sb, BARRIER)) write_flags |= REQ_FUA | REQ_PREFLUSH; lock_buffer(bh); - clear_buffer_dirty(bh); + set_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = ext4_end_buffer_io_sync; submit_bh(REQ_OP_WRITE, write_flags, bh); -- cgit v1.2.3 From a3114fe747be42351ac1368bd3ad30f695e473a7 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:59:05 -0800 Subject: ext4: remove unnecessary fast commit calls from ext4_file_mmap Remove unnecessary calls to ext4_fc_start_update() and ext4_fc_stop_update() from ext4_file_mmap(). Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-17-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d85412d12e3a..80ad5ccc0288 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -761,7 +761,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (!daxdev_mapping_supported(vma, dax_dev)) return -EOPNOTSUPP; - ext4_fc_start_update(inode); file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -769,7 +768,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) } else { vma->vm_ops = &ext4_file_vm_ops; } - ext4_fc_stop_update(inode); return 0; } -- cgit v1.2.3 From 1ceecb537f72734e4315638e7a1bb62e56c86fbf Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:59:06 -0800 Subject: ext4: fix inode dirty check in case of fast commits In case of fast commits, determine if the inode is dirty by checking if the inode is on fast commit list. This also helps us get rid of ext4_inode_info.i_fc_committed_subtid field. Reported-by: Andrea Righi Tested-by: Andrea Righi Reviewed-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-18-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 --- fs/ext4/fast_commit.c | 3 --- fs/ext4/inode.c | 3 +-- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c6339bd75a93..2a89c5bbb61b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1028,9 +1028,6 @@ struct ext4_inode_info { * protected by sbi->s_fc_lock. */ - /* Fast commit subtid when this inode was committed */ - unsigned int i_fc_committed_subtid; - /* Start of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_start; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 05e6e76a7663..6b963e09af2c 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -152,7 +152,6 @@ void ext4_fc_init_inode(struct inode *inode) INIT_LIST_HEAD(&ei->i_fc_list); init_waitqueue_head(&ei->i_fc_wait); atomic_set(&ei->i_fc_updates, 0); - ei->i_fc_committed_subtid = 0; } /* This function must be called with sbi->s_fc_lock held. */ @@ -1037,8 +1036,6 @@ static int ext4_fc_perform_commit(journal_t *journal) if (ret) goto out; spin_lock(&sbi->s_fc_lock); - EXT4_I(inode)->i_fc_committed_subtid = - atomic_read(&sbi->s_fc_subtid); } spin_unlock(&sbi->s_fc_lock); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 87120c4c44f3..000bf70e88ed 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3312,8 +3312,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) EXT4_I(inode)->i_datasync_tid)) return false; if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) - return atomic_read(&EXT4_SB(inode->i_sb)->s_fc_subtid) < - EXT4_I(inode)->i_fc_committed_subtid; + return !list_empty(&EXT4_I(inode)->i_fc_list); return true; } -- cgit v1.2.3 From 556e0319fbb8eee3fa19fdcc27c8bcb4af1c7211 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:59:07 -0800 Subject: ext4: disable fast commit with data journalling Fast commits don't work with data journalling. This patch disables the fast commit support when data journalling is turned on. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20201106035911.1942128-19-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 7 +++++++ fs/ext4/fast_commit.h | 1 + fs/ext4/super.c | 3 ++- include/trace/events/ext4.h | 6 ++++-- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 6b963e09af2c..fb9b4e9d82b2 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -483,6 +483,12 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (S_ISDIR(inode->i_mode)) return; + if (ext4_should_journal_data(inode)) { + ext4_fc_mark_ineligible(inode->i_sb, + EXT4_FC_REASON_INODE_JOURNAL_DATA); + return; + } + ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(inode, ret); } @@ -2102,6 +2108,7 @@ const char *fc_ineligible_reasons[] = { "Resize", "Dir renamed", "Falloc range op", + "Data journalling", "FC Commit Failed" }; diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 1d96e0ac8138..3a6e5a1fa1b8 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -102,6 +102,7 @@ enum { EXT4_FC_REASON_RESIZE, EXT4_FC_REASON_RENAME_DIR, EXT4_FC_REASON_FALLOC_RANGE, + EXT4_FC_REASON_INODE_JOURNAL_DATA, EXT4_FC_COMMIT_FAILED, EXT4_FC_REASON_MAX }; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ba02d7c86fb3..9e2e06ed2c45 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4340,9 +4340,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #endif if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n"); + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n"); /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); + clear_opt2(sb, JOURNAL_FAST_COMMIT); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 0f899d3b09d3..70ae5497b73a 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -104,7 +104,8 @@ TRACE_DEFINE_ENUM(ES_REFERENCED_B); { EXT4_FC_REASON_SWAP_BOOT, "SWAP_BOOT"}, \ { EXT4_FC_REASON_RESIZE, "RESIZE"}, \ { EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \ - { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}) + { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \ + { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}) TRACE_EVENT(ext4_other_inode_update_time, TP_PROTO(struct inode *inode, ino_t orig_ino), @@ -2917,7 +2918,7 @@ TRACE_EVENT(ext4_fc_stats, ), TP_printk("dev %d:%d fc ineligible reasons:\n" - "%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d; " + "%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d; " "num_commits:%ld, ineligible: %ld, numblks: %ld", MAJOR(__entry->dev), MINOR(__entry->dev), FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR), @@ -2928,6 +2929,7 @@ TRACE_EVENT(ext4_fc_stats, FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE), FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR), FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE), + FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA), __entry->sbi->s_fc_stats.fc_num_commits, __entry->sbi->s_fc_stats.fc_ineligible_commits, __entry->sbi->s_fc_stats.fc_numblks) -- cgit v1.2.3 From da0c5d2695265962f20099737348fcb3ff524d0f Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:59:08 -0800 Subject: ext4: issue fsdev cache flush before starting fast commit If the journal dev is different from fsdev, issue a cache flush before committing fast commit blocks to disk. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20201106035911.1942128-20-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index fb9b4e9d82b2..ebe5f423f8f2 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1007,6 +1007,13 @@ static int ext4_fc_perform_commit(journal_t *journal) if (ret) return ret; + /* + * If file system device is different from journal device, issue a cache + * flush before we start writing fast commit blocks. + */ + if (journal->j_fs_dev != journal->j_dev) + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); + blk_start_plug(&plug); if (sbi->s_fc_bytes == 0) { /* -- cgit v1.2.3 From 9b5f6c9b83d912c63ef9fb486a052be79b06f8b0 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:59:09 -0800 Subject: ext4: make s_mount_flags modifications atomic Fast commit file system states are recorded in sbi->s_mount_flags. Fast commit expects these bit manipulations to be atomic. This patch adds helpers to make those modifications atomic. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-21-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 40 +++++++++++++++++++++++++++++----------- fs/ext4/fast_commit.c | 18 +++++++++--------- fs/ext4/file.c | 4 ++-- fs/ext4/fsync.c | 2 +- fs/ext4/inode.c | 4 ++-- fs/ext4/mballoc.c | 4 ++-- fs/ext4/super.c | 14 +++++++------- 7 files changed, 52 insertions(+), 34 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2a89c5bbb61b..1b399cafb15a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1419,16 +1419,6 @@ struct ext4_super_block { #ifdef __KERNEL__ -/* - * run-time mount flags - */ -#define EXT4_MF_MNTDIR_SAMPLED 0x0001 -#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ -#define EXT4_MF_FC_INELIGIBLE 0x0004 /* Fast commit ineligible */ -#define EXT4_MF_FC_COMMITTING 0x0008 /* File system underoing a fast - * commit. - */ - #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL) #else @@ -1463,7 +1453,7 @@ struct ext4_sb_info { struct buffer_head * __rcu *s_group_desc; unsigned int s_mount_opt; unsigned int s_mount_opt2; - unsigned int s_mount_flags; + unsigned long s_mount_flags; unsigned int s_def_mount_opt; ext4_fsblk_t s_sb_block; atomic64_t s_resv_clusters; @@ -1691,6 +1681,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) _v; \ }) +/* + * run-time mount flags + */ +enum { + EXT4_MF_MNTDIR_SAMPLED, + EXT4_MF_FS_ABORTED, /* Fatal error detected */ + EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ + EXT4_MF_FC_COMMITTING /* File system underoing a fast + * commit. + */ +}; + +static inline void ext4_set_mount_flag(struct super_block *sb, int bit) +{ + set_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline void ext4_clear_mount_flag(struct super_block *sb, int bit) +{ + clear_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline int ext4_test_mount_flag(struct super_block *sb, int bit) +{ + return test_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + + /* * Simulate_fail codes */ diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index ebe5f423f8f2..5cd6630ab1b9 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -261,7 +261,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason) (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; - sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE; + ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -294,14 +294,14 @@ void ext4_fc_stop_ineligible(struct super_block *sb) (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE; + ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates); } static inline int ext4_fc_is_ineligible(struct super_block *sb) { - return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) || - atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates); + return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) || + atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates)); } /* @@ -349,7 +349,7 @@ static int ext4_fc_track_template( spin_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, - (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ? + (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); spin_unlock(&sbi->s_fc_lock); @@ -402,7 +402,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) node->fcd_name.len = dentry->d_name.len; spin_lock(&sbi->s_fc_lock); - if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else @@ -857,7 +857,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) int ret = 0; spin_lock(&sbi->s_fc_lock); - sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING; + ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { ei = list_entry(pos, struct ext4_inode_info, i_fc_list); ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); @@ -1206,8 +1206,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full) list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_STAGING]); - sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING; - sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE; + ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (full) sbi->s_fc_bytes = 0; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 80ad5ccc0288..3ed8c048fb12 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -780,13 +780,13 @@ static int ext4_sample_last_mounted(struct super_block *sb, handle_t *handle; int err; - if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED)) + if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) return 0; - sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; + ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); /* * Sample where the filesystem has been mounted and * store it in the superblock for sysadmin convenience diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 81a545fd14a3..a42ca95840f2 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -143,7 +143,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (sb_rdonly(inode->i_sb)) { /* Make sure that we read updated s_mount_flags value */ smp_rmb(); - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED)) ret = -EROFS; goto out; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 000bf70e88ed..0d8385aea898 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2442,7 +2442,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, struct super_block *sb = inode->i_sb; if (ext4_forced_shutdown(EXT4_SB(sb)) || - EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. @@ -2676,7 +2676,7 @@ static int ext4_writepages(struct address_space *mapping, * the stack trace. */ if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || - sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { ret = -EROFS; goto out_writepages; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f067e83f149e..24af9ed5c3e5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4477,7 +4477,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) return; ngroups = ext4_get_groups_count(sb); @@ -4508,7 +4508,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) return; mb_debug(sb, "Can't allocate:" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9e2e06ed2c45..7bd02c252f7f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -686,7 +686,7 @@ static void ext4_handle_error(struct super_block *sb) if (!test_opt(sb, ERRORS_CONT)) { journal_t *journal = EXT4_SB(sb)->s_journal; - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); if (journal) jbd2_journal_abort(journal, -EIO); } @@ -904,7 +904,7 @@ void __ext4_abort(struct super_block *sb, const char *function, va_end(args); if (sb_rdonly(sb) == 0) { - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); @@ -2153,7 +2153,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt); return 1; case Opt_abort: - sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); return 1; case Opt_i_version: sb->s_flags |= SB_I_VERSION; @@ -4778,8 +4778,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; - sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE; - sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; @@ -5881,7 +5881,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -5895,7 +5895,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { err = -EROFS; goto restore_opts; } -- cgit v1.2.3 From 87a144f09380152d28352ecbcc4c65874e7eb892 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:59:10 -0800 Subject: jbd2: don't start fast commit on aborted journal Fast commit should not be started if the journal is aborted. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-22-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f7ebf6ef69af..0c3d5e3b24b2 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -727,6 +727,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) { + if (unlikely(is_journal_aborted(journal))) + return -EIO; /* * Fast commits only allowed if at least one full commit has * been processed. -- cgit v1.2.3 From 99c880decf27858b5b0a57d8d811bb50226c3c12 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:59:11 -0800 Subject: ext4: cleanup fast commit mount options Drop no_fc mount option that disable fast commit even if it was enabled at mkfs time. Move fc_debug_force mount option under ifdef EXT4_DEBUG to annotate that this is strictly for debugging and testing purposes and should not be used in production. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-23-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7bd02c252f7f..c3b864588a0b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1716,11 +1716,10 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, - Opt_prefetch_block_bitmaps, Opt_no_fc, + Opt_prefetch_block_bitmaps, #ifdef CONFIG_EXT4_DEBUG - Opt_fc_debug_max_replay, + Opt_fc_debug_max_replay, Opt_fc_debug_force #endif - Opt_fc_debug_force }; static const match_table_t tokens = { @@ -1807,9 +1806,8 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable=%u"}, {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, - {Opt_no_fc, "no_fc"}, - {Opt_fc_debug_force, "fc_debug_force"}, #ifdef CONFIG_EXT4_DEBUG + {Opt_fc_debug_force, "fc_debug_force"}, {Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"}, #endif {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, @@ -2039,11 +2037,9 @@ static const struct mount_opts { {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, - {Opt_no_fc, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, - MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY}, +#ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, -#ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_max_replay, 0, MOPT_GTE0}, #endif {Opt_err, 0, 0} -- cgit v1.2.3 From fa329e27317f7f0762001b9fb1e76c387a9db25d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 6 Nov 2020 23:59:42 -0500 Subject: ext4: fix sparse warnings in fast_commit code Add missing __acquire() and __releases() annotations, and make fc_ineligible_reasons[] static, as it is not used outside of fs/ext4/fast_commit.c. Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 5cd6630ab1b9..f2033e13a273 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -156,6 +156,7 @@ void ext4_fc_init_inode(struct inode *inode) /* This function must be called with sbi->s_fc_lock held. */ static void ext4_fc_wait_committing_inode(struct inode *inode) +__releases(&EXT4_SB(inode->i_sb)->s_fc_lock) { wait_queue_head_t *wq; struct ext4_inode_info *ei = EXT4_I(inode); @@ -911,6 +912,8 @@ static int ext4_fc_wait_inode_data_all(journal_t *journal) /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) +__acquires(&sbi->s_fc_lock) +__releases(&sbi->s_fc_lock) { struct super_block *sb = (struct super_block *)(journal->j_private); struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -2106,7 +2109,7 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal) journal->j_fc_cleanup_callback = ext4_fc_cleanup; } -const char *fc_ineligible_reasons[] = { +static const char *fc_ineligible_reasons[] = { "Extended attributes changed", "Cross rename", "Journal flag changed", -- cgit v1.2.3 From 05d5233df85e9621597c5838e95235107eb624a2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 7 Nov 2020 00:00:49 -0500 Subject: jbd2: fix up sparse warnings in checkpoint code Add missing __acquires() and __releases() annotations. Also, in an "this should never happen" WARN_ON check, if it *does* actually happen, we need to release j_state_lock since this function is always supposed to release that lock. Otherwise, things will quickly grind to a halt after the WARN_ON trips. Fixes: 96f1e0974575 ("jbd2: avoid long hold times of j_state_lock...") Cc: stable@kernel.org Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 2 ++ fs/jbd2/transaction.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 263f02ad8ebf..472932b9e6bc 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -106,6 +106,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) * for a checkpoint to free up some space in the log. */ void __jbd2_log_wait_for_space(journal_t *journal) +__acquires(&journal->j_state_lock) +__releases(&journal->j_state_lock) { int nblocks, space_left; /* assert_spin_locked(&journal->j_state_lock); */ diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 43985738aa86..d54f04674e8e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -195,8 +195,10 @@ static void wait_transaction_switching(journal_t *journal) DEFINE_WAIT(wait); if (WARN_ON(!journal->j_running_transaction || - journal->j_running_transaction->t_state != T_SWITCH)) + journal->j_running_transaction->t_state != T_SWITCH)) { + read_unlock(&journal->j_state_lock); return; + } prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); read_unlock(&journal->j_state_lock); -- cgit v1.2.3 From 1aec69ae56be28b5fd3c9daead5f3840c30153c8 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 5 Nov 2020 16:27:39 -0600 Subject: x86/platform/uv: Fix missing OEM_TABLE_ID Testing shows a problem in that the OEM_TABLE_ID was missing for hubless systems. This is used to determine the APIC type (legacy or extended). Add the OEM_TABLE_ID to the early hubless processing. Fixes: 1e61f5a95f191 ("Add and decode Arch Type in UVsystab") Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201105222741.157029-2-mike.travis@hpe.com --- arch/x86/kernel/apic/x2apic_uv_x.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 714233cee0b5..a5794794ea59 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -366,7 +366,7 @@ static int __init early_get_arch_type(void) return ret; } -static int __init uv_set_system_type(char *_oem_id) +static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) { /* Save OEM_ID passed from ACPI MADT */ uv_stringify(sizeof(oem_id), oem_id, _oem_id); @@ -394,6 +394,9 @@ static int __init uv_set_system_type(char *_oem_id) else uv_hubless_system = 0x9; + /* Copy APIC type */ + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); + pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n", oem_id, oem_table_id, uv_system_type, uv_hubless_system); return 0; @@ -456,7 +459,7 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; /* If not UV, return. */ - if (likely(uv_set_system_type(_oem_id) == 0)) + if (uv_set_system_type(_oem_id, _oem_table_id) == 0) return 0; /* Save and Decode OEM Table ID */ -- cgit v1.2.3 From 1aee505e0171fc38fd5ed70c7f0dcbb7398c759f Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 5 Nov 2020 16:27:40 -0600 Subject: x86/platform/uv: Remove spaces from OEM IDs Testing shows that trailing spaces caused problems with the OEM_ID and the OEM_TABLE_ID. One being that the OEM_ID would not string compare correctly. Another the OEM_ID and OEM_TABLE_ID would be concatenated in the printout. Remove any trailing spaces. Fixes: 1e61f5a95f191 ("Add and decode Arch Type in UVsystab") Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201105222741.157029-3-mike.travis@hpe.com --- arch/x86/kernel/apic/x2apic_uv_x.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index a5794794ea59..0f848d6dddc9 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -290,6 +290,9 @@ static void __init uv_stringify(int len, char *to, char *from) { /* Relies on 'to' being NULL chars so result will be NULL terminated */ strncpy(to, from, len-1); + + /* Trim trailing spaces */ + (void)strim(to); } /* Find UV arch type entry in UVsystab */ -- cgit v1.2.3 From 801284f9737883a2b2639bd494455a72c82fdedf Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 5 Nov 2020 16:27:41 -0600 Subject: x86/platform/uv: Recognize UV5 hubless system identifier Testing shows a problem in that UV5 hubless systems were not being recognized. Add them to the list of OEM IDs checked. Fixes: 6c7794423a998 ("Add UV5 direct references") Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201105222741.157029-4-mike.travis@hpe.com --- arch/x86/kernel/apic/x2apic_uv_x.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 0f848d6dddc9..3115caa7d7d0 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -389,13 +389,20 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) /* (Not hubless), not a UV */ return 0; + /* Is UV hubless system */ + uv_hubless_system = 0x01; + + /* UV5 Hubless */ + if (strncmp(uv_archtype, "NSGI5", 5) == 0) + uv_hubless_system |= 0x20; + /* UV4 Hubless: CH */ - if (strncmp(uv_archtype, "NSGI4", 5) == 0) - uv_hubless_system = 0x11; + else if (strncmp(uv_archtype, "NSGI4", 5) == 0) + uv_hubless_system |= 0x10; /* UV3 Hubless: UV300/MC990X w/o hub */ else - uv_hubless_system = 0x9; + uv_hubless_system |= 0x8; /* Copy APIC type */ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); -- cgit v1.2.3 From 7bdb157cdebbf95a1cd94ed2e01b338714075d00 Mon Sep 17 00:00:00 2001 From: "kiyin(尹亮)" Date: Wed, 4 Nov 2020 08:23:22 +0300 Subject: perf/core: Fix a memory leak in perf_event_parse_addr_filter() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As shown through runtime testing, the "filename" allocation is not always freed in perf_event_parse_addr_filter(). There are three possible ways that this could happen: - It could be allocated twice on subsequent iterations through the loop, - or leaked on the success path, - or on the failure path. Clean up the code flow to make it obvious that 'filename' is always freed in the reallocation path and in the two return paths as well. We rely on the fact that kfree(NULL) is NOP and filename is initialized with NULL. This fixes the leak. No other side effects expected. [ Dan Carpenter: cleaned up the code flow & added a changelog. ] [ Ingo Molnar: updated the changelog some more. ] Fixes: 375637bc5249 ("perf/core: Introduce address range filtering") Signed-off-by: "kiyin(尹亮)" Signed-off-by: Dan Carpenter Signed-off-by: Ingo Molnar Cc: "Srivatsa S. Bhat" Cc: Anthony Liguori -- kernel/events/core.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) --- kernel/events/core.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index da467e1dd49a..5a29ab09e72d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10085,6 +10085,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { int fpos = token == IF_SRC_FILE ? 2 : 1; + kfree(filename); filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; @@ -10131,16 +10132,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ ret = -EOPNOTSUPP; if (!event->ctx->task) - goto fail_free_name; + goto fail; /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &filter->path); if (ret) - goto fail_free_name; - - kfree(filename); - filename = NULL; + goto fail; ret = -EINVAL; if (!filter->path.dentry || @@ -10160,13 +10158,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (state != IF_STATE_ACTION) goto fail; + kfree(filename); kfree(orig); return 0; -fail_free_name: - kfree(filename); fail: + kfree(filename); free_filters_list(filters); kfree(orig); -- cgit v1.2.3 From cc6528bc9a0c901c83b8220a2e2617f3354d6dd9 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 5 Nov 2020 15:28:42 +0100 Subject: r8169: fix potential skb double free in an error path The caller of rtl8169_tso_csum_v2() frees the skb if false is returned. eth_skb_pad() internally frees the skb on error what would result in a double free. Therefore use __skb_put_padto() directly and instruct it to not free the skb on error. Fixes: b423e9ae49d7 ("r8169: fix offloaded tx checksum for small packets.") Reported-by: Jakub Kicinski Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/f7e68191-acff-9ded-4263-c016428a8762@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 7766d73823eb..4cb43a980ce9 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4163,7 +4163,8 @@ static bool rtl8169_tso_csum_v2(struct rtl8169_private *tp, opts[1] |= transport_offset << TCPHO_SHIFT; } else { if (unlikely(skb->len < ETH_ZLEN && rtl_test_hw_pad_bug(tp))) - return !eth_skb_pad(skb); + /* eth_skb_pad would free the skb on error */ + return !__skb_put_padto(skb, ETH_ZLEN, false); } return true; -- cgit v1.2.3 From 847f0a2bfd2fe16d6afa537816b313b71f32e139 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 5 Nov 2020 18:14:47 +0100 Subject: r8169: disable hw csum for short packets on all chip versions RTL8125B has same or similar short packet hw padding bug as RTL8168evl. The main workaround has been extended accordingly, however we have to disable also hw checksumming for short packets on affected new chip versions. Instead of checking for an affected chip version let's simply disable hw checksumming for short packets in general. v2: - remove the version checks and disable short packet hw csum in general - reflect this in commit title and message Fixes: 0439297be951 ("r8169: add support for RTL8125B") Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/7fbb35f0-e244-ef65-aa55-3872d7d38698@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 4cb43a980ce9..85d9c3e30c69 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4343,18 +4343,9 @@ static netdev_features_t rtl8169_features_check(struct sk_buff *skb, rtl_chip_supports_csum_v2(tp)) features &= ~NETIF_F_ALL_TSO; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb->len < ETH_ZLEN) { - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_11: - case RTL_GIGA_MAC_VER_12: - case RTL_GIGA_MAC_VER_17: - case RTL_GIGA_MAC_VER_34: - features &= ~NETIF_F_CSUM_MASK; - break; - default: - break; - } - } + /* work around hw bug on some chip versions */ + if (skb->len < ETH_ZLEN) + features &= ~NETIF_F_CSUM_MASK; if (transport_offset > TCPHO_MAX && rtl_chip_supports_csum_v2(tp)) -- cgit v1.2.3 From 4e0396c59559264442963b349ab71f66e471f84d Mon Sep 17 00:00:00 2001 From: Vadym Kochan Date: Fri, 6 Nov 2020 18:11:25 +0200 Subject: net: marvell: prestera: fix compilation with CONFIG_BRIDGE=m With CONFIG_BRIDGE=m the compilation fails: ld: drivers/net/ethernet/marvell/prestera/prestera_switchdev.o: in function `prestera_bridge_port_event': prestera_switchdev.c:(.text+0x2ebd): undefined reference to `br_vlan_enabled' in case the driver is statically enabled. Fix it by adding 'BRIDGE || BRIDGE=n' dependency. Fixes: e1189d9a5fbe ("net: marvell: prestera: Add Switchdev driver implementation") Reported-by: Randy Dunlap Signed-off-by: Vadym Kochan Acked-by: Randy Dunlap # build-tested Link: https://lore.kernel.org/r/20201106161128.24069-1-vadym.kochan@plvision.eu Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/prestera/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/prestera/Kconfig b/drivers/net/ethernet/marvell/prestera/Kconfig index b1fcc44f566a..b6f20e2034c6 100644 --- a/drivers/net/ethernet/marvell/prestera/Kconfig +++ b/drivers/net/ethernet/marvell/prestera/Kconfig @@ -6,6 +6,7 @@ config PRESTERA tristate "Marvell Prestera Switch ASICs support" depends on NET_SWITCHDEV && VLAN_8021Q + depends on BRIDGE || BRIDGE=n select NET_DEVLINK help This driver supports Marvell Prestera Switch ASICs family. -- cgit v1.2.3 From 9f5d1c336a10c0d24e83e40b4c1b9539f7dba627 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 4 Nov 2020 16:12:44 +0100 Subject: futex: Handle transient "ownerless" rtmutex state correctly Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner(). This is one possible chain of events leading to this: Task Prio Operation T1 120 lock(F) T2 120 lock(F) -> blocks (top waiter) T3 50 (RT) lock(F) -> boosts T1 and blocks (new top waiter) XX timeout/ -> wakes T2 signal T1 50 unlock(F) -> wakes T3 (rtmutex->owner == NULL, waiter bit is set) T2 120 cleanup -> try_to_take_mutex() fails because T3 is the top waiter and the lower priority T2 cannot steal the lock. -> fixup_pi_state_owner() sees newowner == NULL -> BUG_ON() The comment states that this is invalid and rt_mutex_real_owner() must return a non NULL owner when the trylock failed, but in case of a queued and woken up waiter rt_mutex_real_owner() == NULL is a valid transient state. The higher priority waiter has simply not yet managed to take over the rtmutex. The BUG_ON() is therefore wrong and this is just another retry condition in fixup_pi_state_owner(). Drop the locks, so that T3 can make progress, and then try the fixup again. Gratian provided a great analysis, traces and a reproducer. The analysis is to the point, but it confused the hell out of that tglx dude who had to page in all the futex horrors again. Condensed version is above. [ tglx: Wrote comment and changelog ] Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex") Reported-by: Gratian Crisan Signed-off-by: Mike Galbraith Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87a6w6x7bb.fsf@ni.com Link: https://lore.kernel.org/r/87sg9pkvf7.fsf@nanos.tec.linutronix.de --- kernel/futex.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index f8614ef4ff31..ac328874f6e5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2380,10 +2380,22 @@ retry: } /* - * Since we just failed the trylock; there must be an owner. + * The trylock just failed, so either there is an owner or + * there is a higher priority waiter than this one. */ newowner = rt_mutex_owner(&pi_state->pi_mutex); - BUG_ON(!newowner); + /* + * If the higher priority waiter has not yet taken over the + * rtmutex then newowner is NULL. We can't return here with + * that state because it's inconsistent vs. the user space + * state. So drop the locks and try again. It's a valid + * situation and not any different from the other retry + * conditions. + */ + if (unlikely(!newowner)) { + err = -EAGAIN; + goto handle_err; + } } else { WARN_ON_ONCE(argowner != current); if (oldowner == current) { -- cgit v1.2.3 From c6c4f961cb879aed67b1343bdef2087c899fdaa9 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Sun, 27 Sep 2020 16:44:57 +0800 Subject: KVM: x86/mmu: fix counting of rmap entries in pte_list_add Fix an off-by-one style bug in pte_list_add() where it failed to account the last full set of SPTEs, i.e. when desc->sptes is full and desc->more is NULL. Merge the two "PTE_LIST_EXT-1" checks as part of the fix to avoid an extra comparison. Signed-off-by: Li RongQing Reviewed-by: Sean Christopherson Message-Id: <1601196297-24104-1-git-send-email-lirongqing@baidu.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1f96adff8dc4..5bb1939b65d8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -856,12 +856,14 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, } else { rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - while (desc->sptes[PTE_LIST_EXT-1] && desc->more) { - desc = desc->more; + while (desc->sptes[PTE_LIST_EXT-1]) { count += PTE_LIST_EXT; - } - if (desc->sptes[PTE_LIST_EXT-1]) { - desc->more = mmu_alloc_pte_list_desc(vcpu); + + if (!desc->more) { + desc->more = mmu_alloc_pte_list_desc(vcpu); + desc = desc->more; + break; + } desc = desc->more; } for (i = 0; desc->sptes[i]; ++i) -- cgit v1.2.3 From 3d20267abc789e6753fce60019bb5945fe8a74f3 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 30 Sep 2020 21:20:31 -0400 Subject: KVM: Documentation: Update entry for KVM_X86_SET_MSR_FILTER It should be an accident when rebase, since we've already have section 8.25 (which is KVM_CAP_S390_DIAG318). Fix the number. Signed-off-by: Peter Xu Message-Id: <20201001012044.5151-2-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 36d5f1f3c6dd..e53e3bd99b2c 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6367,7 +6367,7 @@ accesses that would usually trigger a #GP by KVM into the guest will instead get bounced to user space through the KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications. -8.25 KVM_X86_SET_MSR_FILTER +8.27 KVM_X86_SET_MSR_FILTER --------------------------- :Architectures: x86 -- cgit v1.2.3 From 177158e5b1a558a28b9ce6b27a14bea588a6f2fb Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 23 Oct 2020 14:33:46 -0400 Subject: KVM: Documentation: Update entry for KVM_CAP_ENFORCE_PV_CPUID Should be squashed into 66570e966dd9cb4f. Signed-off-by: Peter Xu Message-Id: <20201023183358.50607-3-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index e53e3bd99b2c..e00a66d72372 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6381,8 +6381,7 @@ In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to trap and emulate MSRs that are outside of the scope of KVM as well as limit the attack surface on KVM's MSR emulation code. - -8.26 KVM_CAP_ENFORCE_PV_CPUID +8.28 KVM_CAP_ENFORCE_PV_CPUID ----------------------------- Architectures: x86 -- cgit v1.2.3 From cc4cb017678aa66d3fb4501b2f7424ed28fc7f4d Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Sun, 1 Nov 2020 13:55:23 +0200 Subject: KVM: x86: use positive error values for msr emulation that causes #GP Recent introduction of the userspace msr filtering added code that uses negative error codes for cases that result in either #GP delivery to the guest, or handled by the userspace msr filtering. This breaks an assumption that a negative error code returned from the msr emulation code is a semi-fatal error which should be returned to userspace via KVM_RUN ioctl and usually kill the guest. Fix this by reusing the already existing KVM_MSR_RET_INVALID error code, and by adding a new KVM_MSR_RET_FILTERED error code for the userspace filtered msrs. Fixes: 291f35fb2c1d1 ("KVM: x86: report negative values from wrmsr emulation to userspace") Reported-by: Qian Cai Signed-off-by: Maxim Levitsky Message-Id: <20201101115523.115780-1-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 28 +++++++++++++++------------- arch/x86/kvm/x86.h | 8 +++++++- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f5ede41bf9e6..a4c59be8d566 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -255,11 +255,10 @@ static struct kmem_cache *x86_emulator_cache; /* * When called, it means the previous get/set msr reached an invalid msr. - * Return 0 if we want to ignore/silent this failed msr access, or 1 if we want - * to fail the caller. + * Return true if we want to ignore/silent this failed msr access. */ -static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, - u64 data, bool write) +static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, + u64 data, bool write) { const char *op = write ? "wrmsr" : "rdmsr"; @@ -268,11 +267,11 @@ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, data); /* Mask the error */ - return 0; + return true; } else { kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", op, msr, data); - return -ENOENT; + return false; } } @@ -1416,7 +1415,8 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) if (r == KVM_MSR_RET_INVALID) { /* Unconditionally clear the output for simplicity */ *data = 0; - r = kvm_msr_ignored_check(vcpu, index, 0, false); + if (kvm_msr_ignored_check(vcpu, index, 0, false)) + r = 0; } if (r) @@ -1540,7 +1540,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, struct msr_data msr; if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) - return -EPERM; + return KVM_MSR_RET_FILTERED; switch (index) { case MSR_FS_BASE: @@ -1581,7 +1581,8 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, int ret = __kvm_set_msr(vcpu, index, data, host_initiated); if (ret == KVM_MSR_RET_INVALID) - ret = kvm_msr_ignored_check(vcpu, index, data, true); + if (kvm_msr_ignored_check(vcpu, index, data, true)) + ret = 0; return ret; } @@ -1599,7 +1600,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, int ret; if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) - return -EPERM; + return KVM_MSR_RET_FILTERED; msr.index = index; msr.host_initiated = host_initiated; @@ -1618,7 +1619,8 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, if (ret == KVM_MSR_RET_INVALID) { /* Unconditionally clear *data for simplicity */ *data = 0; - ret = kvm_msr_ignored_check(vcpu, index, 0, false); + if (kvm_msr_ignored_check(vcpu, index, 0, false)) + ret = 0; } return ret; @@ -1662,9 +1664,9 @@ static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) static u64 kvm_msr_reason(int r) { switch (r) { - case -ENOENT: + case KVM_MSR_RET_INVALID: return KVM_MSR_EXIT_REASON_UNKNOWN; - case -EPERM: + case KVM_MSR_RET_FILTERED: return KVM_MSR_EXIT_REASON_FILTER; default: return KVM_MSR_EXIT_REASON_INVAL; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 3900ab0c6004..e7ca622a468f 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -376,7 +376,13 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); -#define KVM_MSR_RET_INVALID 2 +/* + * Internal error codes that are used to indicate that MSR emulation encountered + * an error that should result in #GP in the guest, unless userspace + * handles it. + */ +#define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */ +#define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */ #define __cr4_reserved_bits(__cpu_has, __c) \ ({ \ -- cgit v1.2.3 From 1930e5ddcead2c23567131e62c86b15efce054be Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 27 Oct 2020 16:10:41 -0700 Subject: kvm: x86: reads of restricted pv msrs should also result in #GP commit 66570e966dd9 ("kvm: x86: only provide PV features if enabled in guest's CPUID") only protects against disallowed guest writes to KVM paravirtual msrs, leaving msr reads unchecked. Fix this by enforcing KVM_CPUID_FEATURES for msr reads as well. Fixes: 66570e966dd9 ("kvm: x86: only provide PV features if enabled in guest's CPUID") Signed-off-by: Oliver Upton Reviewed-by: Peter Shier Message-Id: <20201027231044.655110-4-oupton@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a4c59be8d566..d12820acc548 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3465,29 +3465,63 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + + msr_info->data = vcpu->kvm->arch.wall_clock; + break; case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + + msr_info->data = vcpu->arch.time; + break; case MSR_KVM_SYSTEM_TIME_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + msr_info->data = vcpu->arch.apf.msr_en_val; break; case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + msr_info->data = vcpu->arch.apf.msr_int_val; break; case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + msr_info->data = 0; break; case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return 1; + msr_info->data = vcpu->arch.st.msr_val; break; case MSR_KVM_PV_EOI_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return 1; + msr_info->data = vcpu->arch.pv_eoi.msr_val; break; case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return 1; + msr_info->data = vcpu->arch.msr_kvm_poll_control; break; case MSR_IA32_P5_MC_ADDR: -- cgit v1.2.3 From 01b4f510b9f467abfc781e198e810e1ecffb782e Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 27 Oct 2020 16:10:42 -0700 Subject: kvm: x86: ensure pv_cpuid.features is initialized when enabling cap Make the paravirtual cpuid enforcement mechanism idempotent to ioctl() ordering by updating pv_cpuid.features whenever userspace requests the capability. Extract this update out of kvm_update_cpuid_runtime() into a new helper function and move its other call site into kvm_vcpu_after_set_cpuid() where it more likely belongs. Fixes: 66570e966dd9 ("kvm: x86: only provide PV features if enabled in guest's CPUID") Signed-off-by: Oliver Upton Reviewed-by: Peter Shier Message-Id: <20201027231044.655110-5-oupton@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 23 ++++++++++++++++------- arch/x86/kvm/cpuid.h | 1 + arch/x86/kvm/x86.c | 2 ++ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 06a278b3701d..d50041f570e8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -90,6 +90,20 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent) return 0; } +void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); + + /* + * save the feature bitmap to avoid cpuid lookup for every PV + * operation + */ + if (best) + vcpu->arch.pv_cpuid.features = best->eax; +} + void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -124,13 +138,6 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); - /* - * save the feature bitmap to avoid cpuid lookup for every PV - * operation - */ - if (best) - vcpu->arch.pv_cpuid.features = best->eax; - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = kvm_find_cpuid_entry(vcpu, 0x1, 0); if (best) @@ -162,6 +169,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & supported_xcr0; + kvm_update_pv_runtime(vcpu); + vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); kvm_mmu_reset_context(vcpu); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index bf8577947ed2..f7a6e8f83783 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -11,6 +11,7 @@ extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); +void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d12820acc548..948561183fd1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4611,6 +4611,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: vcpu->arch.pv_cpuid.enforce = cap->args[0]; + if (vcpu->arch.pv_cpuid.enforce) + kvm_update_pv_runtime(vcpu); return 0; -- cgit v1.2.3 From 1e293d1ae88cd0e2a0ad4c275f5dc2d8ae7b4387 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 27 Oct 2020 16:10:43 -0700 Subject: kvm: x86: request masterclock update any time guest uses different msr Commit 5b9bb0ebbcdc ("kvm: x86: encapsulate wrmsr(MSR_KVM_SYSTEM_TIME) emulation in helper fn", 2020-10-21) subtly changed the behavior of guest writes to MSR_KVM_SYSTEM_TIME(_NEW). Restore the previous behavior; update the masterclock any time the guest uses a different msr than before. Fixes: 5b9bb0ebbcdc ("kvm: x86: encapsulate wrmsr(MSR_KVM_SYSTEM_TIME) emulation in helper fn", 2020-10-21) Signed-off-by: Oliver Upton Reviewed-by: Peter Shier Message-Id: <20201027231044.655110-6-oupton@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 948561183fd1..d9651faabe68 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1967,7 +1967,7 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, struct kvm_arch *ka = &vcpu->kvm->arch; if (vcpu->vcpu_id == 0 && !host_initiated) { - if (ka->boot_vcpu_runs_old_kvmclock && old_msr) + if (ka->boot_vcpu_runs_old_kvmclock != old_msr) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); ka->boot_vcpu_runs_old_kvmclock = old_msr; -- cgit v1.2.3 From 2cdef91cf882abc74dd2f6bfae16db782b44c6ce Mon Sep 17 00:00:00 2001 From: Pankaj Gupta Date: Thu, 5 Nov 2020 16:39:32 +0100 Subject: KVM: x86: handle MSR_IA32_DEBUGCTLMSR with report_ignored_msrs Windows2016 guest tries to enable LBR by setting the corresponding bits in MSR_IA32_DEBUGCTLMSR. KVM does not emulate MSR_IA32_DEBUGCTLMSR and spams the host kernel logs with error messages like: kvm [...]: vcpu1, guest rIP: 0xfffff800a8b687d3 kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop" This patch fixes this by enabling error logging only with 'report_ignored_msrs=1'. Signed-off-by: Pankaj Gupta Message-Id: <20201105153932.24316-1-pankaj.gupta.linux@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d9651faabe68..447edc0d1d5a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3065,9 +3065,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Values other than LBR and BTF are vendor-specific, thus reserved and should throw a #GP */ return 1; - } - vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); + } else if (report_ignored_msrs) + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); break; case 0x200 ... 0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); -- cgit v1.2.3 From df11f7dd5834146defa448acba097e8d7703cc42 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Mon, 12 Oct 2020 12:47:13 -0700 Subject: selftests: kvm: Fix the segment descriptor layout to match the actual layout Fix the layout of 'struct desc64' to match the layout described in the SDM Vol 3, Chapter 3 "Protected-Mode Memory Management", section 3.4.5 "Segment Descriptors", Figure 3-8 "Segment Descriptor". The test added later in this series relies on this and crashes if this layout is not correct. Signed-off-by: Aaron Lewis Reviewed-by: Alexander Graf Message-Id: <20201012194716.3950330-2-aaronlewis@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/x86_64/processor.h | 2 +- tools/testing/selftests/kvm/lib/x86_64/processor.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 82b7fe16a824..0a65e7bb5249 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -59,7 +59,7 @@ struct gpr64_regs { struct desc64 { uint16_t limit0; uint16_t base0; - unsigned base1:8, s:1, type:4, dpl:2, p:1; + unsigned base1:8, type:4, s:1, dpl:2, p:1; unsigned limit1:4, avl:1, l:1, db:1, g:1, base2:8; uint32_t base3; uint32_t zero1; diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index f6eb34eaa0d2..1ccf6c9b3476 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -392,11 +392,12 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) desc->limit0 = segp->limit & 0xFFFF; desc->base0 = segp->base & 0xFFFF; desc->base1 = segp->base >> 16; - desc->s = segp->s; desc->type = segp->type; + desc->s = segp->s; desc->dpl = segp->dpl; desc->p = segp->present; desc->limit1 = segp->limit >> 16; + desc->avl = segp->avl; desc->l = segp->l; desc->db = segp->db; desc->g = segp->g; -- cgit v1.2.3 From 85f2a4320ef27ce74b9da0631460561028c48756 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Mon, 12 Oct 2020 12:47:14 -0700 Subject: selftests: kvm: Clear uc so UCALL_NONE is being properly reported Ensure the out value 'uc' in get_ucall() is properly reporting UCALL_NONE if the call fails. The return value will be correctly reported, however, the out parameter 'uc' will not be. Clear the struct to ensure the correct value is being reported in the out parameter. Signed-off-by: Aaron Lewis Reviewed-by: Andrew Jones Reviewed-by: Alexander Graf Message-Id: <20201012194716.3950330-3-aaronlewis@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/aarch64/ucall.c | 3 +++ tools/testing/selftests/kvm/lib/s390x/ucall.c | 3 +++ tools/testing/selftests/kvm/lib/x86_64/ucall.c | 3 +++ 3 files changed, 9 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index c8e0ec20d3bf..2f37b90ee1a9 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -94,6 +94,9 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) struct kvm_run *run = vcpu_state(vm, vcpu_id); struct ucall ucall = {}; + if (uc) + memset(uc, 0, sizeof(*uc)); + if (run->exit_reason == KVM_EXIT_MMIO && run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) { vm_vaddr_t gva; diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c index fd589dc9bfab..9d3b0f15249a 100644 --- a/tools/testing/selftests/kvm/lib/s390x/ucall.c +++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c @@ -38,6 +38,9 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) struct kvm_run *run = vcpu_state(vm, vcpu_id); struct ucall ucall = {}; + if (uc) + memset(uc, 0, sizeof(*uc)); + if (run->exit_reason == KVM_EXIT_S390_SIEIC && run->s390_sieic.icptcode == 4 && (run->s390_sieic.ipa >> 8) == 0x83 && /* 0x83 means DIAGNOSE */ diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c index da4d89ad5419..a3489973e290 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c +++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c @@ -40,6 +40,9 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) struct kvm_run *run = vcpu_state(vm, vcpu_id); struct ucall ucall = {}; + if (uc) + memset(uc, 0, sizeof(*uc)); + if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) { struct kvm_regs regs; -- cgit v1.2.3 From 29faeb9632012d6c3fa4aa33c3d589b9ff18b206 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Mon, 12 Oct 2020 12:47:15 -0700 Subject: selftests: kvm: Add exception handling to selftests Add the infrastructure needed to enable exception handling in selftests. This allows any of the exception and interrupt vectors to be overridden in the guest. Signed-off-by: Aaron Lewis Reviewed-by: Alexander Graf Message-Id: <20201012194716.3950330-4-aaronlewis@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 19 ++-- tools/testing/selftests/kvm/include/kvm_util.h | 2 + .../selftests/kvm/include/x86_64/processor.h | 24 +++++ .../testing/selftests/kvm/lib/aarch64/processor.c | 4 + tools/testing/selftests/kvm/lib/kvm_util.c | 3 + .../testing/selftests/kvm/lib/kvm_util_internal.h | 2 + tools/testing/selftests/kvm/lib/s390x/processor.c | 4 + tools/testing/selftests/kvm/lib/x86_64/handlers.S | 81 +++++++++++++++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 114 ++++++++++++++++++++- 9 files changed, 244 insertions(+), 9 deletions(-) create mode 100644 tools/testing/selftests/kvm/lib/x86_64/handlers.S diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 30afbad36cd5..b0f1fcab79f5 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -34,7 +34,7 @@ ifeq ($(ARCH),s390) endif LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c -LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c +LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c @@ -111,14 +111,21 @@ LDFLAGS += -pthread $(no-pie-option) $(pgste-option) include ../lib.mk STATIC_LIBS := $(OUTPUT)/libkvm.a -LIBKVM_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM)) -EXTRA_CLEAN += $(LIBKVM_OBJ) $(STATIC_LIBS) cscope.* +LIBKVM_C := $(filter %.c,$(LIBKVM)) +LIBKVM_S := $(filter %.S,$(LIBKVM)) +LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C)) +LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S)) +EXTRA_CLEAN += $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(STATIC_LIBS) cscope.* + +x := $(shell mkdir -p $(sort $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)))) +$(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ -x := $(shell mkdir -p $(sort $(dir $(LIBKVM_OBJ)))) -$(LIBKVM_OBJ): $(OUTPUT)/%.o: %.c +$(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ -$(OUTPUT)/libkvm.a: $(LIBKVM_OBJ) +LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) +$(OUTPUT)/libkvm.a: $(LIBKVM_OBJS) $(AR) crs $@ $^ x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS)))) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 919e161dd289..356930690bea 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -294,6 +294,8 @@ int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd); memcpy(&(g), _p, sizeof(g)); \ }) +void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid); + /* Common ucalls */ enum { UCALL_NONE, diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 0a65e7bb5249..02530dc6339b 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -36,6 +36,8 @@ #define X86_CR4_SMAP (1ul << 21) #define X86_CR4_PKE (1ul << 22) +#define UNEXPECTED_VECTOR_PORT 0xfff0u + /* General Registers in 64-Bit Mode */ struct gpr64_regs { u64 rax; @@ -239,6 +241,11 @@ static inline struct desc_ptr get_idt(void) return idt; } +static inline void outl(uint16_t port, uint32_t value) +{ + __asm__ __volatile__("outl %%eax, %%dx" : : "d"(port), "a"(value)); +} + #define SET_XMM(__var, __xmm) \ asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm) @@ -338,6 +345,23 @@ uint32_t kvm_get_cpuid_max_basic(void); uint32_t kvm_get_cpuid_max_extended(void); void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); +struct ex_regs { + uint64_t rax, rcx, rdx, rbx; + uint64_t rbp, rsi, rdi; + uint64_t r8, r9, r10, r11; + uint64_t r12, r13, r14, r15; + uint64_t vector; + uint64_t error_code; + uint64_t rip; + uint64_t cs; + uint64_t rflags; +}; + +void vm_init_descriptor_tables(struct kvm_vm *vm); +void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid); +void vm_handle_exception(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)); + /* * Basic CPU control in CR0 */ diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 2afa6618b396..d6c32c328e9a 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -350,3 +350,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) va_end(ap); } + +void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) +{ +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 3327cebc1095..be230f3728d5 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1204,6 +1204,9 @@ int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) do { rc = ioctl(vcpu->fd, KVM_RUN, NULL); } while (rc == -1 && errno == EINTR); + + assert_on_unhandled_exception(vm, vcpuid); + return rc; } diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index 2ef446520748..f07d383d03a1 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -50,6 +50,8 @@ struct kvm_vm { vm_paddr_t pgd; vm_vaddr_t gdt; vm_vaddr_t tss; + vm_vaddr_t idt; + vm_vaddr_t handlers; }; struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid); diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index a88c5d665725..7349bb2e1a24 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -241,3 +241,7 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr); } + +void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) +{ +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/handlers.S b/tools/testing/selftests/kvm/lib/x86_64/handlers.S new file mode 100644 index 000000000000..aaf7bc7d2ce1 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/handlers.S @@ -0,0 +1,81 @@ +handle_exception: + push %r15 + push %r14 + push %r13 + push %r12 + push %r11 + push %r10 + push %r9 + push %r8 + + push %rdi + push %rsi + push %rbp + push %rbx + push %rdx + push %rcx + push %rax + mov %rsp, %rdi + + call route_exception + + pop %rax + pop %rcx + pop %rdx + pop %rbx + pop %rbp + pop %rsi + pop %rdi + pop %r8 + pop %r9 + pop %r10 + pop %r11 + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + /* Discard vector and error code. */ + add $16, %rsp + iretq + +/* + * Build the handle_exception wrappers which push the vector/error code on the + * stack and an array of pointers to those wrappers. + */ +.pushsection .rodata +.globl idt_handlers +idt_handlers: +.popsection + +.macro HANDLERS has_error from to + vector = \from + .rept \to - \from + 1 + .align 8 + + /* Fetch current address and append it to idt_handlers. */ + current_handler = . +.pushsection .rodata +.quad current_handler +.popsection + + .if ! \has_error + pushq $0 + .endif + pushq $vector + jmp handle_exception + vector = vector + 1 + .endr +.endm + +.global idt_handler_code +idt_handler_code: + HANDLERS has_error=0 from=0 to=7 + HANDLERS has_error=1 from=8 to=8 + HANDLERS has_error=0 from=9 to=9 + HANDLERS has_error=1 from=10 to=14 + HANDLERS has_error=0 from=15 to=16 + HANDLERS has_error=1 from=17 to=17 + HANDLERS has_error=0 from=18 to=255 + +.section .note.GNU-stack, "", %progbits diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 1ccf6c9b3476..66228297ac03 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -12,9 +12,18 @@ #include "../kvm_util_internal.h" #include "processor.h" +#ifndef NUM_INTERRUPTS +#define NUM_INTERRUPTS 256 +#endif + +#define DEFAULT_CODE_SELECTOR 0x8 +#define DEFAULT_DATA_SELECTOR 0x10 + /* Minimum physical address used for virtual translation tables. */ #define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 +vm_vaddr_t exception_handlers; + /* Virtual translation table structure declarations */ struct pageMapL4Entry { uint64_t present:1; @@ -557,9 +566,9 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); kvm_seg_set_unusable(&sregs.ldt); - kvm_seg_set_kernel_code_64bit(vm, 0x8, &sregs.cs); - kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.ds); - kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.es); + kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs); + kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds); + kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es); kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot); break; @@ -1119,3 +1128,102 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) *va_bits = (entry->eax >> 8) & 0xff; } } + +struct idt_entry { + uint16_t offset0; + uint16_t selector; + uint16_t ist : 3; + uint16_t : 5; + uint16_t type : 4; + uint16_t : 1; + uint16_t dpl : 2; + uint16_t p : 1; + uint16_t offset1; + uint32_t offset2; uint32_t reserved; +}; + +static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, + int dpl, unsigned short selector) +{ + struct idt_entry *base = + (struct idt_entry *)addr_gva2hva(vm, vm->idt); + struct idt_entry *e = &base[vector]; + + memset(e, 0, sizeof(*e)); + e->offset0 = addr; + e->selector = selector; + e->ist = 0; + e->type = 14; + e->dpl = dpl; + e->p = 1; + e->offset1 = addr >> 16; + e->offset2 = addr >> 32; +} + +void kvm_exit_unexpected_vector(uint32_t value) +{ + outl(UNEXPECTED_VECTOR_PORT, value); +} + +void route_exception(struct ex_regs *regs) +{ + typedef void(*handler)(struct ex_regs *); + handler *handlers = (handler *)exception_handlers; + + if (handlers && handlers[regs->vector]) { + handlers[regs->vector](regs); + return; + } + + kvm_exit_unexpected_vector(regs->vector); +} + +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + extern void *idt_handlers; + int i; + + vm->idt = vm_vaddr_alloc(vm, getpagesize(), 0x2000, 0, 0); + vm->handlers = vm_vaddr_alloc(vm, 256 * sizeof(void *), 0x2000, 0, 0); + /* Handlers have the same address in both address spaces.*/ + for (i = 0; i < NUM_INTERRUPTS; i++) + set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, + DEFAULT_CODE_SELECTOR); +} + +void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct kvm_sregs sregs; + + vcpu_sregs_get(vm, vcpuid, &sregs); + sregs.idt.base = vm->idt; + sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; + sregs.gdt.base = vm->gdt; + sregs.gdt.limit = getpagesize() - 1; + kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); + vcpu_sregs_set(vm, vcpuid, &sregs); + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; +} + +void vm_handle_exception(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); + + handlers[vector] = (vm_vaddr_t)handler; +} + +void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) +{ + if (vcpu_state(vm, vcpuid)->exit_reason == KVM_EXIT_IO + && vcpu_state(vm, vcpuid)->io.port == UNEXPECTED_VECTOR_PORT + && vcpu_state(vm, vcpuid)->io.size == 4) { + /* Grab pointer to io data */ + uint32_t *data = (void *)vcpu_state(vm, vcpuid) + + vcpu_state(vm, vcpuid)->io.data_offset; + + TEST_ASSERT(false, + "Unexpected vectored event in guest (vector:0x%x)", + *data); + } +} -- cgit v1.2.3 From ac4a4d6de22e674cd6e3fe57199a15383496aad2 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 27 Oct 2020 16:10:44 -0700 Subject: selftests: kvm: test enforcement of paravirtual cpuid features Add a set of tests that ensure the guest cannot access paravirtual msrs and hypercalls that have been disabled in the KVM_CPUID_FEATURES leaf. Expect a #GP in the case of msr accesses and -KVM_ENOSYS from hypercalls. Cc: Jim Mattson Signed-off-by: Oliver Upton Reviewed-by: Peter Shier Reviewed-by: Aaron Lewis Message-Id: <20201027231044.655110-7-oupton@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/include/kvm_util.h | 3 + .../selftests/kvm/include/x86_64/processor.h | 12 ++ tools/testing/selftests/kvm/lib/kvm_util.c | 28 +++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 29 +++ tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 234 +++++++++++++++++++++ 7 files changed, 308 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/kvm_pv_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index d2c2d6205008..22973aa75eda 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -5,6 +5,7 @@ /x86_64/cr4_cpuid_sync_test /x86_64/debug_regs /x86_64/evmcs_test +/x86_64/kvm_pv_test /x86_64/hyperv_cpuid /x86_64/mmio_warning_test /x86_64/platform_info_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index b0f1fcab79f5..44ca6badb544 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -41,6 +41,7 @@ LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid +TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 356930690bea..77f53dbc34ff 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -63,6 +63,9 @@ enum vm_mem_backing_src_type { int kvm_check_cap(long cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); +int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_enable_cap *cap); +void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 02530dc6339b..8e61340b3911 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -362,6 +362,18 @@ void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid); void vm_handle_exception(struct kvm_vm *vm, int vector, void (*handler)(struct ex_regs *)); +/* + * set_cpuid() - overwrites a matching cpuid entry with the provided value. + * matches based on ent->function && ent->index. returns true + * if a match was found and successfully overwritten. + * @cpuid: the kvm cpuid list to modify. + * @ent: cpuid entry to insert + */ +bool set_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *ent); + +uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, + uint64_t a3); + /* * Basic CPU control in CR0 */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index be230f3728d5..91e547031d8c 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -86,6 +86,34 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) return ret; } +/* VCPU Enable Capability + * + * Input Args: + * vm - Virtual Machine + * vcpu_id - VCPU + * cap - Capability + * + * Output Args: None + * + * Return: On success, 0. On failure a TEST_ASSERT failure is produced. + * + * Enables a capability (KVM_CAP_*) on the VCPU. + */ +int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_enable_cap *cap) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpu_id); + int r; + + TEST_ASSERT(vcpu, "cannot find vcpu %d", vcpu_id); + + r = ioctl(vcpu->fd, KVM_ENABLE_CAP, cap); + TEST_ASSERT(!r, "KVM_ENABLE_CAP vCPU ioctl failed,\n" + " rc: %i, errno: %i", r, errno); + + return r; +} + static void vm_open(struct kvm_vm *vm, int perm) { vm->kvm_fd = open(KVM_DEV_PATH, perm); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 66228297ac03..d10c5c05bdf0 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1227,3 +1227,32 @@ void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) *data); } } + +bool set_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 *ent) +{ + int i; + + for (i = 0; i < cpuid->nent; i++) { + struct kvm_cpuid_entry2 *cur = &cpuid->entries[i]; + + if (cur->function != ent->function || cur->index != ent->index) + continue; + + memcpy(cur, ent, sizeof(struct kvm_cpuid_entry2)); + return true; + } + + return false; +} + +uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, + uint64_t a3) +{ + uint64_t r; + + asm volatile("vmcall" + : "=a"(r) + : "b"(a0), "c"(a1), "d"(a2), "S"(a3)); + return r; +} diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c new file mode 100644 index 000000000000..b10a27485bad --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020, Google LLC. + * + * Tests for KVM paravirtual feature disablement + */ +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +extern unsigned char rdmsr_start; +extern unsigned char rdmsr_end; + +static u64 do_rdmsr(u32 idx) +{ + u32 lo, hi; + + asm volatile("rdmsr_start: rdmsr;" + "rdmsr_end:" + : "=a"(lo), "=c"(hi) + : "c"(idx)); + + return (((u64) hi) << 32) | lo; +} + +extern unsigned char wrmsr_start; +extern unsigned char wrmsr_end; + +static void do_wrmsr(u32 idx, u64 val) +{ + u32 lo, hi; + + lo = val; + hi = val >> 32; + + asm volatile("wrmsr_start: wrmsr;" + "wrmsr_end:" + : : "a"(lo), "c"(idx), "d"(hi)); +} + +static int nr_gp; + +static void guest_gp_handler(struct ex_regs *regs) +{ + unsigned char *rip = (unsigned char *)regs->rip; + bool r, w; + + r = rip == &rdmsr_start; + w = rip == &wrmsr_start; + GUEST_ASSERT(r || w); + + nr_gp++; + + if (r) + regs->rip = (uint64_t)&rdmsr_end; + else + regs->rip = (uint64_t)&wrmsr_end; +} + +struct msr_data { + uint32_t idx; + const char *name; +}; + +#define TEST_MSR(msr) { .idx = msr, .name = #msr } +#define UCALL_PR_MSR 0xdeadbeef +#define PR_MSR(msr) ucall(UCALL_PR_MSR, 1, msr) + +/* + * KVM paravirtual msrs to test. Expect a #GP if any of these msrs are read or + * written, as the KVM_CPUID_FEATURES leaf is cleared. + */ +static struct msr_data msrs_to_test[] = { + TEST_MSR(MSR_KVM_SYSTEM_TIME), + TEST_MSR(MSR_KVM_SYSTEM_TIME_NEW), + TEST_MSR(MSR_KVM_WALL_CLOCK), + TEST_MSR(MSR_KVM_WALL_CLOCK_NEW), + TEST_MSR(MSR_KVM_ASYNC_PF_EN), + TEST_MSR(MSR_KVM_STEAL_TIME), + TEST_MSR(MSR_KVM_PV_EOI_EN), + TEST_MSR(MSR_KVM_POLL_CONTROL), + TEST_MSR(MSR_KVM_ASYNC_PF_INT), + TEST_MSR(MSR_KVM_ASYNC_PF_ACK), +}; + +static void test_msr(struct msr_data *msr) +{ + PR_MSR(msr); + do_rdmsr(msr->idx); + GUEST_ASSERT(READ_ONCE(nr_gp) == 1); + + nr_gp = 0; + do_wrmsr(msr->idx, 0); + GUEST_ASSERT(READ_ONCE(nr_gp) == 1); + nr_gp = 0; +} + +struct hcall_data { + uint64_t nr; + const char *name; +}; + +#define TEST_HCALL(hc) { .nr = hc, .name = #hc } +#define UCALL_PR_HCALL 0xdeadc0de +#define PR_HCALL(hc) ucall(UCALL_PR_HCALL, 1, hc) + +/* + * KVM hypercalls to test. Expect -KVM_ENOSYS when called, as the corresponding + * features have been cleared in KVM_CPUID_FEATURES. + */ +static struct hcall_data hcalls_to_test[] = { + TEST_HCALL(KVM_HC_KICK_CPU), + TEST_HCALL(KVM_HC_SEND_IPI), + TEST_HCALL(KVM_HC_SCHED_YIELD), +}; + +static void test_hcall(struct hcall_data *hc) +{ + uint64_t r; + + PR_HCALL(hc); + r = kvm_hypercall(hc->nr, 0, 0, 0, 0); + GUEST_ASSERT(r == -KVM_ENOSYS); +} + +static void guest_main(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(msrs_to_test); i++) { + test_msr(&msrs_to_test[i]); + } + + for (i = 0; i < ARRAY_SIZE(hcalls_to_test); i++) { + test_hcall(&hcalls_to_test[i]); + } + + GUEST_DONE(); +} + +static void clear_kvm_cpuid_features(struct kvm_cpuid2 *cpuid) +{ + struct kvm_cpuid_entry2 ent = {0}; + + ent.function = KVM_CPUID_FEATURES; + TEST_ASSERT(set_cpuid(cpuid, &ent), + "failed to clear KVM_CPUID_FEATURES leaf"); +} + +static void pr_msr(struct ucall *uc) +{ + struct msr_data *msr = (struct msr_data *)uc->args[0]; + + pr_info("testing msr: %s (%#x)\n", msr->name, msr->idx); +} + +static void pr_hcall(struct ucall *uc) +{ + struct hcall_data *hc = (struct hcall_data *)uc->args[0]; + + pr_info("testing hcall: %s (%lu)\n", hc->name, hc->nr); +} + +static void handle_abort(struct ucall *uc) +{ + TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0], + __FILE__, uc->args[1]); +} + +#define VCPU_ID 0 + +static void enter_guest(struct kvm_vm *vm) +{ + struct kvm_run *run; + struct ucall uc; + int r; + + run = vcpu_state(vm, VCPU_ID); + + while (true) { + r = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(!r, "vcpu_run failed: %d\n", r); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "unexpected exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_PR_MSR: + pr_msr(&uc); + break; + case UCALL_PR_HCALL: + pr_hcall(&uc); + break; + case UCALL_ABORT: + handle_abort(&uc); + return; + case UCALL_DONE: + return; + } + } +} + +int main(void) +{ + struct kvm_enable_cap cap = {0}; + struct kvm_cpuid2 *best; + struct kvm_vm *vm; + + if (!kvm_check_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)) { + pr_info("will skip kvm paravirt restriction tests.\n"); + return 0; + } + + vm = vm_create_default(VCPU_ID, 0, guest_main); + + cap.cap = KVM_CAP_ENFORCE_PV_FEATURE_CPUID; + cap.args[0] = 1; + vcpu_enable_cap(vm, VCPU_ID, &cap); + + best = kvm_get_supported_cpuid(); + clear_kvm_cpuid_features(best); + vcpu_set_cpuid(vm, VCPU_ID, best); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_handle_exception(vm, GP_VECTOR, guest_gp_handler); + + enter_guest(vm); + kvm_vm_free(vm); +} -- cgit v1.2.3 From fd02029a9e019e941835e110651486e2d77d3f84 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Thu, 29 Oct 2020 21:17:01 +0100 Subject: KVM: selftests: Add aarch64 get-reg-list test Check for KVM_GET_REG_LIST regressions. The blessed list was created by running on v4.15 with the --core-reg-fixup option. The following script was also used in order to annotate system registers with their names when possible. When new system registers are added the names can just be added manually using the same grep. while read reg; do if [[ ! $reg =~ ARM64_SYS_REG ]]; then printf "\t$reg\n" continue fi encoding=$(echo "$reg" | sed "s/ARM64_SYS_REG(//;s/),//") if ! name=$(grep "$encoding" ../../../../arch/arm64/include/asm/sysreg.h); then printf "\t$reg\n" continue fi name=$(echo "$name" | sed "s/.*SYS_//;s/[\t ]*sys_reg($encoding)$//") printf "\t$reg\t/* $name */\n" done < <(aarch64/get-reg-list --core-reg-fixup --list) Signed-off-by: Andrew Jones Message-Id: <20201029201703.102716-3-drjones@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/aarch64/get-reg-list.c | 671 +++++++++++++++++++++ tools/testing/selftests/kvm/include/kvm_util.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 29 + 5 files changed, 703 insertions(+) create mode 100644 tools/testing/selftests/kvm/aarch64/get-reg-list.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 22973aa75eda..2034765862a2 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +/aarch64/get-reg-list /s390x/memop /s390x/resets /s390x/sync_regs_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 44ca6badb544..771455ae4a1b 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -66,6 +66,7 @@ TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time +TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list TEST_GEN_PROGS_aarch64 += clear_dirty_log_test TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c new file mode 100644 index 000000000000..3ff097f6886e --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -0,0 +1,671 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check for KVM_GET_REG_LIST regressions. + * + * Copyright (C) 2020, Red Hat, Inc. + * + * When attempting to migrate from a host with an older kernel to a host + * with a newer kernel we allow the newer kernel on the destination to + * list new registers with get-reg-list. We assume they'll be unused, at + * least until the guest reboots, and so they're relatively harmless. + * However, if the destination host with the newer kernel is missing + * registers which the source host with the older kernel has, then that's + * a regression in get-reg-list. This test checks for that regression by + * checking the current list against a blessed list. We should never have + * missing registers, but if new ones appear then they can probably be + * added to the blessed list. A completely new blessed list can be created + * by running the test with the --list command line argument. + * + * Note, the blessed list should be created from the oldest possible + * kernel. We can't go older than v4.15, though, because that's the first + * release to expose the ID system registers in KVM_GET_REG_LIST, see + * commit 93390c0a1b20 ("arm64: KVM: Hide unsupported AArch64 CPU features + * from guests"). Also, one must use the --core-reg-fixup command line + * option when running on an older kernel that doesn't include df205b5c6328 + * ("KVM: arm64: Filter out invalid core register IDs in KVM_GET_REG_LIST") + */ +#include +#include +#include +#include "kvm_util.h" +#include "test_util.h" + +#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK) + +#define for_each_reg(i) \ + for ((i) = 0; (i) < reg_list->n; ++(i)) + +#define for_each_missing_reg(i) \ + for ((i) = 0; (i) < blessed_n; ++(i)) \ + if (!find_reg(reg_list->reg, reg_list->n, blessed_reg[i])) + +#define for_each_new_reg(i) \ + for ((i) = 0; (i) < reg_list->n; ++(i)) \ + if (!find_reg(blessed_reg, blessed_n, reg_list->reg[i])) + + +static struct kvm_reg_list *reg_list; + +static __u64 blessed_reg[]; +static __u64 blessed_n; + +static bool find_reg(__u64 regs[], __u64 nr_regs, __u64 reg) +{ + int i; + + for (i = 0; i < nr_regs; ++i) + if (reg == regs[i]) + return true; + return false; +} + +static const char *str_with_index(const char *template, __u64 index) +{ + char *str, *p; + int n; + + str = strdup(template); + p = strstr(str, "##"); + n = sprintf(p, "%lld", index); + strcat(p + n, strstr(template, "##") + 2); + + return (const char *)str; +} + +#define CORE_REGS_XX_NR_WORDS 2 +#define CORE_SPSR_XX_NR_WORDS 2 +#define CORE_FPREGS_XX_NR_WORDS 4 + +static const char *core_id_to_str(__u64 id) +{ + __u64 core_off = id & ~REG_MASK, idx; + + /* + * core_off is the offset into struct kvm_regs + */ + switch (core_off) { + case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... + KVM_REG_ARM_CORE_REG(regs.regs[30]): + idx = (core_off - KVM_REG_ARM_CORE_REG(regs.regs[0])) / CORE_REGS_XX_NR_WORDS; + TEST_ASSERT(idx < 31, "Unexpected regs.regs index: %lld", idx); + return str_with_index("KVM_REG_ARM_CORE_REG(regs.regs[##])", idx); + case KVM_REG_ARM_CORE_REG(regs.sp): + return "KVM_REG_ARM_CORE_REG(regs.sp)"; + case KVM_REG_ARM_CORE_REG(regs.pc): + return "KVM_REG_ARM_CORE_REG(regs.pc)"; + case KVM_REG_ARM_CORE_REG(regs.pstate): + return "KVM_REG_ARM_CORE_REG(regs.pstate)"; + case KVM_REG_ARM_CORE_REG(sp_el1): + return "KVM_REG_ARM_CORE_REG(sp_el1)"; + case KVM_REG_ARM_CORE_REG(elr_el1): + return "KVM_REG_ARM_CORE_REG(elr_el1)"; + case KVM_REG_ARM_CORE_REG(spsr[0]) ... + KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]): + idx = (core_off - KVM_REG_ARM_CORE_REG(spsr[0])) / CORE_SPSR_XX_NR_WORDS; + TEST_ASSERT(idx < KVM_NR_SPSR, "Unexpected spsr index: %lld", idx); + return str_with_index("KVM_REG_ARM_CORE_REG(spsr[##])", idx); + case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... + KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): + idx = (core_off - KVM_REG_ARM_CORE_REG(fp_regs.vregs[0])) / CORE_FPREGS_XX_NR_WORDS; + TEST_ASSERT(idx < 32, "Unexpected fp_regs.vregs index: %lld", idx); + return str_with_index("KVM_REG_ARM_CORE_REG(fp_regs.vregs[##])", idx); + case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): + return "KVM_REG_ARM_CORE_REG(fp_regs.fpsr)"; + case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): + return "KVM_REG_ARM_CORE_REG(fp_regs.fpcr)"; + } + + TEST_FAIL("Unknown core reg id: 0x%llx", id); + return NULL; +} + +static void print_reg(__u64 id) +{ + unsigned op0, op1, crn, crm, op2; + const char *reg_size = NULL; + + TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64, + "KVM_REG_ARM64 missing in reg id: 0x%llx", id); + + switch (id & KVM_REG_SIZE_MASK) { + case KVM_REG_SIZE_U8: + reg_size = "KVM_REG_SIZE_U8"; + break; + case KVM_REG_SIZE_U16: + reg_size = "KVM_REG_SIZE_U16"; + break; + case KVM_REG_SIZE_U32: + reg_size = "KVM_REG_SIZE_U32"; + break; + case KVM_REG_SIZE_U64: + reg_size = "KVM_REG_SIZE_U64"; + break; + case KVM_REG_SIZE_U128: + reg_size = "KVM_REG_SIZE_U128"; + break; + case KVM_REG_SIZE_U256: + reg_size = "KVM_REG_SIZE_U256"; + break; + case KVM_REG_SIZE_U512: + reg_size = "KVM_REG_SIZE_U512"; + break; + case KVM_REG_SIZE_U1024: + reg_size = "KVM_REG_SIZE_U1024"; + break; + case KVM_REG_SIZE_U2048: + reg_size = "KVM_REG_SIZE_U2048"; + break; + default: + TEST_FAIL("Unexpected reg size: 0x%llx in reg id: 0x%llx", + (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id); + } + + switch (id & KVM_REG_ARM_COPROC_MASK) { + case KVM_REG_ARM_CORE: + printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_CORE | %s,\n", reg_size, core_id_to_str(id)); + break; + case KVM_REG_ARM_DEMUX: + TEST_ASSERT(!(id & ~(REG_MASK | KVM_REG_ARM_DEMUX_ID_MASK | KVM_REG_ARM_DEMUX_VAL_MASK)), + "Unexpected bits set in DEMUX reg id: 0x%llx", id); + printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | %lld,\n", + reg_size, id & KVM_REG_ARM_DEMUX_VAL_MASK); + break; + case KVM_REG_ARM64_SYSREG: + op0 = (id & KVM_REG_ARM64_SYSREG_OP0_MASK) >> KVM_REG_ARM64_SYSREG_OP0_SHIFT; + op1 = (id & KVM_REG_ARM64_SYSREG_OP1_MASK) >> KVM_REG_ARM64_SYSREG_OP1_SHIFT; + crn = (id & KVM_REG_ARM64_SYSREG_CRN_MASK) >> KVM_REG_ARM64_SYSREG_CRN_SHIFT; + crm = (id & KVM_REG_ARM64_SYSREG_CRM_MASK) >> KVM_REG_ARM64_SYSREG_CRM_SHIFT; + op2 = (id & KVM_REG_ARM64_SYSREG_OP2_MASK) >> KVM_REG_ARM64_SYSREG_OP2_SHIFT; + TEST_ASSERT(id == ARM64_SYS_REG(op0, op1, crn, crm, op2), + "Unexpected bits set in SYSREG reg id: 0x%llx", id); + printf("\tARM64_SYS_REG(%d, %d, %d, %d, %d),\n", op0, op1, crn, crm, op2); + break; + case KVM_REG_ARM_FW: + TEST_ASSERT(id == KVM_REG_ARM_FW_REG(id & 0xffff), + "Unexpected bits set in FW reg id: 0x%llx", id); + printf("\tKVM_REG_ARM_FW_REG(%lld),\n", id & 0xffff); + break; + case KVM_REG_ARM64_SVE: + TEST_FAIL("KVM_REG_ARM64_SVE is an unexpected coproc type in reg id: 0x%llx", id); + break; + default: + TEST_FAIL("Unexpected coproc type: 0x%llx in reg id: 0x%llx", + (id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT, id); + } +} + +/* + * Older kernels listed each 32-bit word of CORE registers separately. + * For 64 and 128-bit registers we need to ignore the extra words. We + * also need to fixup the sizes, because the older kernels stated all + * registers were 64-bit, even when they weren't. + */ +static void core_reg_fixup(void) +{ + struct kvm_reg_list *tmp; + __u64 id, core_off; + int i; + + tmp = calloc(1, sizeof(*tmp) + reg_list->n * sizeof(__u64)); + + for (i = 0; i < reg_list->n; ++i) { + id = reg_list->reg[i]; + + if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM_CORE) { + tmp->reg[tmp->n++] = id; + continue; + } + + core_off = id & ~REG_MASK; + + switch (core_off) { + case 0x52: case 0xd2: case 0xd6: + /* + * These offsets are pointing at padding. + * We need to ignore them too. + */ + continue; + case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... + KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): + if (core_off & 3) + continue; + id &= ~KVM_REG_SIZE_MASK; + id |= KVM_REG_SIZE_U128; + tmp->reg[tmp->n++] = id; + continue; + case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): + case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): + id &= ~KVM_REG_SIZE_MASK; + id |= KVM_REG_SIZE_U32; + tmp->reg[tmp->n++] = id; + continue; + default: + if (core_off & 1) + continue; + tmp->reg[tmp->n++] = id; + break; + } + } + + free(reg_list); + reg_list = tmp; +} + +int main(int ac, char **av) +{ + int new_regs = 0, missing_regs = 0, i; + int failed_get = 0, failed_set = 0; + bool print_list = false, fixup_core_regs = false; + struct kvm_vm *vm; + + for (i = 1; i < ac; ++i) { + if (strcmp(av[i], "--core-reg-fixup") == 0) + fixup_core_regs = true; + else if (strcmp(av[i], "--list") == 0) + print_list = true; + else + fprintf(stderr, "Ignoring unknown option: %s\n", av[i]); + } + + vm = vm_create_default(0, 0, NULL); + reg_list = vcpu_get_reg_list(vm, 0); + + if (fixup_core_regs) + core_reg_fixup(); + + if (print_list) { + putchar('\n'); + for_each_reg(i) + print_reg(reg_list->reg[i]); + putchar('\n'); + return 0; + } + + /* + * We only test that we can get the register and then write back the + * same value. Some registers may allow other values to be written + * back, but others only allow some bits to be changed, and at least + * for ID registers set will fail if the value does not exactly match + * what was returned by get. If registers that allow other values to + * be written need to have the other values tested, then we should + * create a new set of tests for those in a new independent test + * executable. + */ + for_each_reg(i) { + uint8_t addr[2048 / 8]; + struct kvm_one_reg reg = { + .id = reg_list->reg[i], + .addr = (__u64)&addr, + }; + int ret; + + ret = _vcpu_ioctl(vm, 0, KVM_GET_ONE_REG, ®); + if (ret) { + puts("Failed to get "); + print_reg(reg.id); + putchar('\n'); + ++failed_get; + } + + ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + if (ret) { + puts("Failed to set "); + print_reg(reg.id); + putchar('\n'); + ++failed_set; + } + } + + for_each_new_reg(i) + ++new_regs; + + for_each_missing_reg(i) + ++missing_regs; + + if (new_regs || missing_regs) { + printf("Number blessed registers: %5lld\n", blessed_n); + printf("Number registers: %5lld\n", reg_list->n); + } + + if (new_regs) { + printf("\nThere are %d new registers.\n" + "Consider adding them to the blessed reg " + "list with the following lines:\n\n", new_regs); + for_each_new_reg(i) + print_reg(reg_list->reg[i]); + putchar('\n'); + } + + if (missing_regs) { + printf("\nThere are %d missing registers.\n" + "The following lines are missing registers:\n\n", missing_regs); + for_each_missing_reg(i) + print_reg(blessed_reg[i]); + putchar('\n'); + } + + TEST_ASSERT(!missing_regs && !failed_get && !failed_set, + "There are %d missing registers; %d registers failed get; %d registers failed set", + missing_regs, failed_get, failed_set); + + return 0; +} + +/* + * The current blessed list was primed with the output of kernel version + * v4.15 with --core-reg-fixup and then later updated with new registers. + */ +static __u64 blessed_reg[] = { + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[5]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[6]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[7]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[8]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[9]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[10]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[11]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[12]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[13]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[14]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[15]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[16]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[17]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[18]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[19]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[20]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[21]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[22]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[23]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[24]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[25]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[26]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[27]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[28]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[29]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[30]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.sp), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pc), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pstate), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(sp_el1), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(elr_el1), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[5]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[6]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[7]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[8]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[9]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[10]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[11]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[12]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[13]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[14]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[15]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[16]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[17]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[18]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[19]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[20]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[21]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[22]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[23]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[24]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[25]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[26]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[27]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[28]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[29]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[30]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]), + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpsr), + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpcr), + KVM_REG_ARM_FW_REG(0), + KVM_REG_ARM_FW_REG(1), + KVM_REG_ARM_FW_REG(2), + ARM64_SYS_REG(3, 3, 14, 3, 1), /* CNTV_CTL_EL0 */ + ARM64_SYS_REG(3, 3, 14, 3, 2), /* CNTV_CVAL_EL0 */ + ARM64_SYS_REG(3, 3, 14, 0, 2), + ARM64_SYS_REG(3, 0, 0, 0, 0), /* MIDR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 0, 6), /* REVIDR_EL1 */ + ARM64_SYS_REG(3, 1, 0, 0, 1), /* CLIDR_EL1 */ + ARM64_SYS_REG(3, 1, 0, 0, 7), /* AIDR_EL1 */ + ARM64_SYS_REG(3, 3, 0, 0, 1), /* CTR_EL0 */ + ARM64_SYS_REG(2, 0, 0, 0, 4), + ARM64_SYS_REG(2, 0, 0, 0, 5), + ARM64_SYS_REG(2, 0, 0, 0, 6), + ARM64_SYS_REG(2, 0, 0, 0, 7), + ARM64_SYS_REG(2, 0, 0, 1, 4), + ARM64_SYS_REG(2, 0, 0, 1, 5), + ARM64_SYS_REG(2, 0, 0, 1, 6), + ARM64_SYS_REG(2, 0, 0, 1, 7), + ARM64_SYS_REG(2, 0, 0, 2, 0), /* MDCCINT_EL1 */ + ARM64_SYS_REG(2, 0, 0, 2, 2), /* MDSCR_EL1 */ + ARM64_SYS_REG(2, 0, 0, 2, 4), + ARM64_SYS_REG(2, 0, 0, 2, 5), + ARM64_SYS_REG(2, 0, 0, 2, 6), + ARM64_SYS_REG(2, 0, 0, 2, 7), + ARM64_SYS_REG(2, 0, 0, 3, 4), + ARM64_SYS_REG(2, 0, 0, 3, 5), + ARM64_SYS_REG(2, 0, 0, 3, 6), + ARM64_SYS_REG(2, 0, 0, 3, 7), + ARM64_SYS_REG(2, 0, 0, 4, 4), + ARM64_SYS_REG(2, 0, 0, 4, 5), + ARM64_SYS_REG(2, 0, 0, 4, 6), + ARM64_SYS_REG(2, 0, 0, 4, 7), + ARM64_SYS_REG(2, 0, 0, 5, 4), + ARM64_SYS_REG(2, 0, 0, 5, 5), + ARM64_SYS_REG(2, 0, 0, 5, 6), + ARM64_SYS_REG(2, 0, 0, 5, 7), + ARM64_SYS_REG(2, 0, 0, 6, 4), + ARM64_SYS_REG(2, 0, 0, 6, 5), + ARM64_SYS_REG(2, 0, 0, 6, 6), + ARM64_SYS_REG(2, 0, 0, 6, 7), + ARM64_SYS_REG(2, 0, 0, 7, 4), + ARM64_SYS_REG(2, 0, 0, 7, 5), + ARM64_SYS_REG(2, 0, 0, 7, 6), + ARM64_SYS_REG(2, 0, 0, 7, 7), + ARM64_SYS_REG(2, 0, 0, 8, 4), + ARM64_SYS_REG(2, 0, 0, 8, 5), + ARM64_SYS_REG(2, 0, 0, 8, 6), + ARM64_SYS_REG(2, 0, 0, 8, 7), + ARM64_SYS_REG(2, 0, 0, 9, 4), + ARM64_SYS_REG(2, 0, 0, 9, 5), + ARM64_SYS_REG(2, 0, 0, 9, 6), + ARM64_SYS_REG(2, 0, 0, 9, 7), + ARM64_SYS_REG(2, 0, 0, 10, 4), + ARM64_SYS_REG(2, 0, 0, 10, 5), + ARM64_SYS_REG(2, 0, 0, 10, 6), + ARM64_SYS_REG(2, 0, 0, 10, 7), + ARM64_SYS_REG(2, 0, 0, 11, 4), + ARM64_SYS_REG(2, 0, 0, 11, 5), + ARM64_SYS_REG(2, 0, 0, 11, 6), + ARM64_SYS_REG(2, 0, 0, 11, 7), + ARM64_SYS_REG(2, 0, 0, 12, 4), + ARM64_SYS_REG(2, 0, 0, 12, 5), + ARM64_SYS_REG(2, 0, 0, 12, 6), + ARM64_SYS_REG(2, 0, 0, 12, 7), + ARM64_SYS_REG(2, 0, 0, 13, 4), + ARM64_SYS_REG(2, 0, 0, 13, 5), + ARM64_SYS_REG(2, 0, 0, 13, 6), + ARM64_SYS_REG(2, 0, 0, 13, 7), + ARM64_SYS_REG(2, 0, 0, 14, 4), + ARM64_SYS_REG(2, 0, 0, 14, 5), + ARM64_SYS_REG(2, 0, 0, 14, 6), + ARM64_SYS_REG(2, 0, 0, 14, 7), + ARM64_SYS_REG(2, 0, 0, 15, 4), + ARM64_SYS_REG(2, 0, 0, 15, 5), + ARM64_SYS_REG(2, 0, 0, 15, 6), + ARM64_SYS_REG(2, 0, 0, 15, 7), + ARM64_SYS_REG(2, 4, 0, 7, 0), /* DBGVCR32_EL2 */ + ARM64_SYS_REG(3, 0, 0, 0, 5), /* MPIDR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 0), /* ID_PFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 1), /* ID_PFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 2), /* ID_DFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 3), /* ID_AFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 4), /* ID_MMFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 5), /* ID_MMFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 6), /* ID_MMFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 7), /* ID_MMFR3_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 0), /* ID_ISAR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 1), /* ID_ISAR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 2), /* ID_ISAR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 3), /* ID_ISAR3_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 4), /* ID_ISAR4_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 5), /* ID_ISAR5_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 6), /* ID_MMFR4_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 7), /* ID_ISAR6_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 0), /* MVFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 1), /* MVFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 2), /* MVFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 3), + ARM64_SYS_REG(3, 0, 0, 3, 4), /* ID_PFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 5), /* ID_DFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 6), /* ID_MMFR5_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 7), + ARM64_SYS_REG(3, 0, 0, 4, 0), /* ID_AA64PFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 1), /* ID_AA64PFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 2), + ARM64_SYS_REG(3, 0, 0, 4, 3), + ARM64_SYS_REG(3, 0, 0, 4, 4), /* ID_AA64ZFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 5), + ARM64_SYS_REG(3, 0, 0, 4, 6), + ARM64_SYS_REG(3, 0, 0, 4, 7), + ARM64_SYS_REG(3, 0, 0, 5, 0), /* ID_AA64DFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 1), /* ID_AA64DFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 2), + ARM64_SYS_REG(3, 0, 0, 5, 3), + ARM64_SYS_REG(3, 0, 0, 5, 4), /* ID_AA64AFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 5), /* ID_AA64AFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 6), + ARM64_SYS_REG(3, 0, 0, 5, 7), + ARM64_SYS_REG(3, 0, 0, 6, 0), /* ID_AA64ISAR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 6, 1), /* ID_AA64ISAR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 6, 2), + ARM64_SYS_REG(3, 0, 0, 6, 3), + ARM64_SYS_REG(3, 0, 0, 6, 4), + ARM64_SYS_REG(3, 0, 0, 6, 5), + ARM64_SYS_REG(3, 0, 0, 6, 6), + ARM64_SYS_REG(3, 0, 0, 6, 7), + ARM64_SYS_REG(3, 0, 0, 7, 0), /* ID_AA64MMFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 1), /* ID_AA64MMFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 2), /* ID_AA64MMFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), + ARM64_SYS_REG(3, 0, 0, 7, 4), + ARM64_SYS_REG(3, 0, 0, 7, 5), + ARM64_SYS_REG(3, 0, 0, 7, 6), + ARM64_SYS_REG(3, 0, 0, 7, 7), + ARM64_SYS_REG(3, 0, 1, 0, 0), /* SCTLR_EL1 */ + ARM64_SYS_REG(3, 0, 1, 0, 1), /* ACTLR_EL1 */ + ARM64_SYS_REG(3, 0, 1, 0, 2), /* CPACR_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 0), /* TTBR0_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 1), /* TTBR1_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 2), /* TCR_EL1 */ + ARM64_SYS_REG(3, 0, 5, 1, 0), /* AFSR0_EL1 */ + ARM64_SYS_REG(3, 0, 5, 1, 1), /* AFSR1_EL1 */ + ARM64_SYS_REG(3, 0, 5, 2, 0), /* ESR_EL1 */ + ARM64_SYS_REG(3, 0, 6, 0, 0), /* FAR_EL1 */ + ARM64_SYS_REG(3, 0, 7, 4, 0), /* PAR_EL1 */ + ARM64_SYS_REG(3, 0, 9, 14, 1), /* PMINTENSET_EL1 */ + ARM64_SYS_REG(3, 0, 9, 14, 2), /* PMINTENCLR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */ + ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */ + ARM64_SYS_REG(3, 0, 12, 1, 1), /* DISR_EL1 */ + ARM64_SYS_REG(3, 0, 13, 0, 1), /* CONTEXTIDR_EL1 */ + ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ + ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ + ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ + ARM64_SYS_REG(3, 3, 9, 12, 0), /* PMCR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 1), /* PMCNTENSET_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 2), /* PMCNTENCLR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 3), /* PMOVSCLR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 4), /* PMSWINC_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 5), /* PMSELR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 13, 0), /* PMCCNTR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 14, 0), /* PMUSERENR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 14, 3), /* PMOVSSET_EL0 */ + ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ + ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ + ARM64_SYS_REG(3, 3, 14, 8, 0), + ARM64_SYS_REG(3, 3, 14, 8, 1), + ARM64_SYS_REG(3, 3, 14, 8, 2), + ARM64_SYS_REG(3, 3, 14, 8, 3), + ARM64_SYS_REG(3, 3, 14, 8, 4), + ARM64_SYS_REG(3, 3, 14, 8, 5), + ARM64_SYS_REG(3, 3, 14, 8, 6), + ARM64_SYS_REG(3, 3, 14, 8, 7), + ARM64_SYS_REG(3, 3, 14, 9, 0), + ARM64_SYS_REG(3, 3, 14, 9, 1), + ARM64_SYS_REG(3, 3, 14, 9, 2), + ARM64_SYS_REG(3, 3, 14, 9, 3), + ARM64_SYS_REG(3, 3, 14, 9, 4), + ARM64_SYS_REG(3, 3, 14, 9, 5), + ARM64_SYS_REG(3, 3, 14, 9, 6), + ARM64_SYS_REG(3, 3, 14, 9, 7), + ARM64_SYS_REG(3, 3, 14, 10, 0), + ARM64_SYS_REG(3, 3, 14, 10, 1), + ARM64_SYS_REG(3, 3, 14, 10, 2), + ARM64_SYS_REG(3, 3, 14, 10, 3), + ARM64_SYS_REG(3, 3, 14, 10, 4), + ARM64_SYS_REG(3, 3, 14, 10, 5), + ARM64_SYS_REG(3, 3, 14, 10, 6), + ARM64_SYS_REG(3, 3, 14, 10, 7), + ARM64_SYS_REG(3, 3, 14, 11, 0), + ARM64_SYS_REG(3, 3, 14, 11, 1), + ARM64_SYS_REG(3, 3, 14, 11, 2), + ARM64_SYS_REG(3, 3, 14, 11, 3), + ARM64_SYS_REG(3, 3, 14, 11, 4), + ARM64_SYS_REG(3, 3, 14, 11, 5), + ARM64_SYS_REG(3, 3, 14, 11, 6), + ARM64_SYS_REG(3, 3, 14, 12, 0), + ARM64_SYS_REG(3, 3, 14, 12, 1), + ARM64_SYS_REG(3, 3, 14, 12, 2), + ARM64_SYS_REG(3, 3, 14, 12, 3), + ARM64_SYS_REG(3, 3, 14, 12, 4), + ARM64_SYS_REG(3, 3, 14, 12, 5), + ARM64_SYS_REG(3, 3, 14, 12, 6), + ARM64_SYS_REG(3, 3, 14, 12, 7), + ARM64_SYS_REG(3, 3, 14, 13, 0), + ARM64_SYS_REG(3, 3, 14, 13, 1), + ARM64_SYS_REG(3, 3, 14, 13, 2), + ARM64_SYS_REG(3, 3, 14, 13, 3), + ARM64_SYS_REG(3, 3, 14, 13, 4), + ARM64_SYS_REG(3, 3, 14, 13, 5), + ARM64_SYS_REG(3, 3, 14, 13, 6), + ARM64_SYS_REG(3, 3, 14, 13, 7), + ARM64_SYS_REG(3, 3, 14, 14, 0), + ARM64_SYS_REG(3, 3, 14, 14, 1), + ARM64_SYS_REG(3, 3, 14, 14, 2), + ARM64_SYS_REG(3, 3, 14, 14, 3), + ARM64_SYS_REG(3, 3, 14, 14, 4), + ARM64_SYS_REG(3, 3, 14, 14, 5), + ARM64_SYS_REG(3, 3, 14, 14, 6), + ARM64_SYS_REG(3, 3, 14, 14, 7), + ARM64_SYS_REG(3, 3, 14, 15, 0), + ARM64_SYS_REG(3, 3, 14, 15, 1), + ARM64_SYS_REG(3, 3, 14, 15, 2), + ARM64_SYS_REG(3, 3, 14, 15, 3), + ARM64_SYS_REG(3, 3, 14, 15, 4), + ARM64_SYS_REG(3, 3, 14, 15, 5), + ARM64_SYS_REG(3, 3, 14, 15, 6), + ARM64_SYS_REG(3, 3, 14, 15, 7), /* PMCCFILTR_EL0 */ + ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */ + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 0, + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 1, + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 2, +}; +static __u64 blessed_n = ARRAY_SIZE(blessed_reg); diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 77f53dbc34ff..feb949a92055 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -152,6 +152,7 @@ void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_guest_debug *debug); void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_mp_state *mp_state); +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 91e547031d8c..7669cb7c86e3 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1291,6 +1291,35 @@ void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, "rc: %i errno: %i", ret, errno); } +/* + * VM VCPU Get Reg List + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: + * None + * + * Return: + * A pointer to an allocated struct kvm_reg_list + * + * Get the list of guest registers which are supported for + * KVM_GET_ONE_REG/KVM_SET_ONE_REG calls + */ +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list; + int ret; + + ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, ®_list_n); + TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0"); + reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64)); + reg_list->n = reg_list_n.n; + vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, reg_list); + return reg_list; +} + /* * VM VCPU Regs Get * -- cgit v1.2.3 From 31d212959179015bc07f3af4e890cadd26e01ee0 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Thu, 29 Oct 2020 21:17:03 +0100 Subject: KVM: selftests: Add blessed SVE registers to get-reg-list Add support for the SVE registers to get-reg-list and create a new test, get-reg-list-sve, which tests them when running on a machine with SVE support. Signed-off-by: Andrew Jones Message-Id: <20201029201703.102716-5-drjones@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/aarch64/get-reg-list-sve.c | 3 + tools/testing/selftests/kvm/aarch64/get-reg-list.c | 254 +++++++++++++++++---- 4 files changed, 217 insertions(+), 42 deletions(-) create mode 100644 tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 2034765862a2..ea1692d43411 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only /aarch64/get-reg-list +/aarch64/get-reg-list-sve /s390x/memop /s390x/resets /s390x/sync_regs_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 771455ae4a1b..2b0d7d325e2a 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -67,6 +67,7 @@ TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list +TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve TEST_GEN_PROGS_aarch64 += clear_dirty_log_test TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c b/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c new file mode 100644 index 000000000000..efba76682b4b --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c @@ -0,0 +1,3 @@ +// SPDX-License-Identifier: GPL-2.0 +#define REG_LIST_SVE +#include "get-reg-list.c" diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 3ff097f6886e..33218a395d9f 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -29,6 +29,13 @@ #include #include "kvm_util.h" #include "test_util.h" +#include "processor.h" + +#ifdef REG_LIST_SVE +#define reg_list_sve() (true) +#else +#define reg_list_sve() (false) +#endif #define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK) @@ -46,8 +53,9 @@ static struct kvm_reg_list *reg_list; -static __u64 blessed_reg[]; -static __u64 blessed_n; +static __u64 base_regs[], vregs[], sve_regs[], rejects_set[]; +static __u64 base_regs_n, vregs_n, sve_regs_n, rejects_set_n; +static __u64 *blessed_reg, blessed_n; static bool find_reg(__u64 regs[], __u64 nr_regs, __u64 reg) { @@ -119,6 +127,40 @@ static const char *core_id_to_str(__u64 id) return NULL; } +static const char *sve_id_to_str(__u64 id) +{ + __u64 sve_off, n, i; + + if (id == KVM_REG_ARM64_SVE_VLS) + return "KVM_REG_ARM64_SVE_VLS"; + + sve_off = id & ~(REG_MASK | ((1ULL << 5) - 1)); + i = id & (KVM_ARM64_SVE_MAX_SLICES - 1); + + TEST_ASSERT(i == 0, "Currently we don't expect slice > 0, reg id 0x%llx", id); + + switch (sve_off) { + case KVM_REG_ARM64_SVE_ZREG_BASE ... + KVM_REG_ARM64_SVE_ZREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_ZREGS - 1: + n = (id >> 5) & (KVM_ARM64_SVE_NUM_ZREGS - 1); + TEST_ASSERT(id == KVM_REG_ARM64_SVE_ZREG(n, 0), + "Unexpected bits set in SVE ZREG id: 0x%llx", id); + return str_with_index("KVM_REG_ARM64_SVE_ZREG(##, 0)", n); + case KVM_REG_ARM64_SVE_PREG_BASE ... + KVM_REG_ARM64_SVE_PREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_PREGS - 1: + n = (id >> 5) & (KVM_ARM64_SVE_NUM_PREGS - 1); + TEST_ASSERT(id == KVM_REG_ARM64_SVE_PREG(n, 0), + "Unexpected bits set in SVE PREG id: 0x%llx", id); + return str_with_index("KVM_REG_ARM64_SVE_PREG(##, 0)", n); + case KVM_REG_ARM64_SVE_FFR_BASE: + TEST_ASSERT(id == KVM_REG_ARM64_SVE_FFR(0), + "Unexpected bits set in SVE FFR id: 0x%llx", id); + return "KVM_REG_ARM64_SVE_FFR(0)"; + } + + return NULL; +} + static void print_reg(__u64 id) { unsigned op0, op1, crn, crm, op2; @@ -186,7 +228,10 @@ static void print_reg(__u64 id) printf("\tKVM_REG_ARM_FW_REG(%lld),\n", id & 0xffff); break; case KVM_REG_ARM64_SVE: - TEST_FAIL("KVM_REG_ARM64_SVE is an unexpected coproc type in reg id: 0x%llx", id); + if (reg_list_sve()) + printf("\t%s,\n", sve_id_to_str(id)); + else + TEST_FAIL("KVM_REG_ARM64_SVE is an unexpected coproc type in reg id: 0x%llx", id); break; default: TEST_FAIL("Unexpected coproc type: 0x%llx in reg id: 0x%llx", @@ -251,12 +296,40 @@ static void core_reg_fixup(void) reg_list = tmp; } +static void prepare_vcpu_init(struct kvm_vcpu_init *init) +{ + if (reg_list_sve()) + init->features[0] |= 1 << KVM_ARM_VCPU_SVE; +} + +static void finalize_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +{ + int feature; + + if (reg_list_sve()) { + feature = KVM_ARM_VCPU_SVE; + vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_FINALIZE, &feature); + } +} + +static void check_supported(void) +{ + if (reg_list_sve() && !kvm_check_cap(KVM_CAP_ARM_SVE)) { + fprintf(stderr, "SVE not available, skipping tests\n"); + exit(KSFT_SKIP); + } +} + int main(int ac, char **av) { + struct kvm_vcpu_init init = { .target = -1, }; int new_regs = 0, missing_regs = 0, i; - int failed_get = 0, failed_set = 0; + int failed_get = 0, failed_set = 0, failed_reject = 0; bool print_list = false, fixup_core_regs = false; struct kvm_vm *vm; + __u64 *vec_regs; + + check_supported(); for (i = 1; i < ac; ++i) { if (strcmp(av[i], "--core-reg-fixup") == 0) @@ -267,7 +340,11 @@ int main(int ac, char **av) fprintf(stderr, "Ignoring unknown option: %s\n", av[i]); } - vm = vm_create_default(0, 0, NULL); + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + prepare_vcpu_init(&init); + aarch64_vcpu_add_default(vm, 0, &init, NULL); + finalize_vcpu(vm, 0); + reg_list = vcpu_get_reg_list(vm, 0); if (fixup_core_regs) @@ -307,6 +384,18 @@ int main(int ac, char **av) ++failed_get; } + /* rejects_set registers are rejected after KVM_ARM_VCPU_FINALIZE */ + if (find_reg(rejects_set, rejects_set_n, reg.id)) { + ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + if (ret != -1 || errno != EPERM) { + printf("Failed to reject (ret=%d, errno=%d) ", ret, errno); + print_reg(reg.id); + putchar('\n'); + ++failed_reject; + } + continue; + } + ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); if (ret) { puts("Failed to set "); @@ -316,6 +405,20 @@ int main(int ac, char **av) } } + if (reg_list_sve()) { + blessed_n = base_regs_n + sve_regs_n; + vec_regs = sve_regs; + } else { + blessed_n = base_regs_n + vregs_n; + vec_regs = vregs; + } + + blessed_reg = calloc(blessed_n, sizeof(__u64)); + for (i = 0; i < base_regs_n; ++i) + blessed_reg[i] = base_regs[i]; + for (i = 0; i < blessed_n - base_regs_n; ++i) + blessed_reg[base_regs_n + i] = vec_regs[i]; + for_each_new_reg(i) ++new_regs; @@ -344,9 +447,10 @@ int main(int ac, char **av) putchar('\n'); } - TEST_ASSERT(!missing_regs && !failed_get && !failed_set, - "There are %d missing registers; %d registers failed get; %d registers failed set", - missing_regs, failed_get, failed_set); + TEST_ASSERT(!missing_regs && !failed_get && !failed_set && !failed_reject, + "There are %d missing registers; " + "%d registers failed get; %d registers failed set; %d registers failed reject", + missing_regs, failed_get, failed_set, failed_reject); return 0; } @@ -355,7 +459,7 @@ int main(int ac, char **av) * The current blessed list was primed with the output of kernel version * v4.15 with --core-reg-fixup and then later updated with new registers. */ -static __u64 blessed_reg[] = { +static __u64 base_regs[] = { KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]), KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[1]), KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[2]), @@ -397,38 +501,6 @@ static __u64 blessed_reg[] = { KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[2]), KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[3]), KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[4]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[1]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[2]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[3]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[4]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[5]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[6]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[7]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[8]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[9]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[10]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[11]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[12]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[13]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[14]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[15]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[16]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[17]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[18]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[19]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[20]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[21]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[22]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[23]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[24]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[25]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[26]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[27]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[28]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[29]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[30]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]), KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpsr), KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpcr), KVM_REG_ARM_FW_REG(0), @@ -668,4 +740,102 @@ static __u64 blessed_reg[] = { KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 1, KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 2, }; -static __u64 blessed_n = ARRAY_SIZE(blessed_reg); +static __u64 base_regs_n = ARRAY_SIZE(base_regs); + +static __u64 vregs[] = { + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[5]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[6]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[7]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[8]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[9]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[10]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[11]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[12]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[13]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[14]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[15]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[16]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[17]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[18]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[19]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[20]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[21]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[22]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[23]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[24]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[25]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[26]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[27]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[28]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[29]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[30]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]), +}; +static __u64 vregs_n = ARRAY_SIZE(vregs); + +static __u64 sve_regs[] = { + KVM_REG_ARM64_SVE_VLS, + KVM_REG_ARM64_SVE_ZREG(0, 0), + KVM_REG_ARM64_SVE_ZREG(1, 0), + KVM_REG_ARM64_SVE_ZREG(2, 0), + KVM_REG_ARM64_SVE_ZREG(3, 0), + KVM_REG_ARM64_SVE_ZREG(4, 0), + KVM_REG_ARM64_SVE_ZREG(5, 0), + KVM_REG_ARM64_SVE_ZREG(6, 0), + KVM_REG_ARM64_SVE_ZREG(7, 0), + KVM_REG_ARM64_SVE_ZREG(8, 0), + KVM_REG_ARM64_SVE_ZREG(9, 0), + KVM_REG_ARM64_SVE_ZREG(10, 0), + KVM_REG_ARM64_SVE_ZREG(11, 0), + KVM_REG_ARM64_SVE_ZREG(12, 0), + KVM_REG_ARM64_SVE_ZREG(13, 0), + KVM_REG_ARM64_SVE_ZREG(14, 0), + KVM_REG_ARM64_SVE_ZREG(15, 0), + KVM_REG_ARM64_SVE_ZREG(16, 0), + KVM_REG_ARM64_SVE_ZREG(17, 0), + KVM_REG_ARM64_SVE_ZREG(18, 0), + KVM_REG_ARM64_SVE_ZREG(19, 0), + KVM_REG_ARM64_SVE_ZREG(20, 0), + KVM_REG_ARM64_SVE_ZREG(21, 0), + KVM_REG_ARM64_SVE_ZREG(22, 0), + KVM_REG_ARM64_SVE_ZREG(23, 0), + KVM_REG_ARM64_SVE_ZREG(24, 0), + KVM_REG_ARM64_SVE_ZREG(25, 0), + KVM_REG_ARM64_SVE_ZREG(26, 0), + KVM_REG_ARM64_SVE_ZREG(27, 0), + KVM_REG_ARM64_SVE_ZREG(28, 0), + KVM_REG_ARM64_SVE_ZREG(29, 0), + KVM_REG_ARM64_SVE_ZREG(30, 0), + KVM_REG_ARM64_SVE_ZREG(31, 0), + KVM_REG_ARM64_SVE_PREG(0, 0), + KVM_REG_ARM64_SVE_PREG(1, 0), + KVM_REG_ARM64_SVE_PREG(2, 0), + KVM_REG_ARM64_SVE_PREG(3, 0), + KVM_REG_ARM64_SVE_PREG(4, 0), + KVM_REG_ARM64_SVE_PREG(5, 0), + KVM_REG_ARM64_SVE_PREG(6, 0), + KVM_REG_ARM64_SVE_PREG(7, 0), + KVM_REG_ARM64_SVE_PREG(8, 0), + KVM_REG_ARM64_SVE_PREG(9, 0), + KVM_REG_ARM64_SVE_PREG(10, 0), + KVM_REG_ARM64_SVE_PREG(11, 0), + KVM_REG_ARM64_SVE_PREG(12, 0), + KVM_REG_ARM64_SVE_PREG(13, 0), + KVM_REG_ARM64_SVE_PREG(14, 0), + KVM_REG_ARM64_SVE_PREG(15, 0), + KVM_REG_ARM64_SVE_FFR(0), + ARM64_SYS_REG(3, 0, 1, 2, 0), /* ZCR_EL1 */ +}; +static __u64 sve_regs_n = ARRAY_SIZE(sve_regs); + +static __u64 rejects_set[] = { +#ifdef REG_LIST_SVE + KVM_REG_ARM64_SVE_VLS, +#endif +}; +static __u64 rejects_set_n = ARRAY_SIZE(rejects_set); -- cgit v1.2.3 From 3031e0288e60f09533339e61117b83099a6e126e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 30 Sep 2020 21:22:28 -0400 Subject: KVM: selftests: Always clear dirty bitmap after iteration We used not to clear the dirty bitmap before because KVM_GET_DIRTY_LOG would overwrite it the next time it copies the dirty log onto it. In the upcoming dirty ring tests we'll start to fetch dirty pages from a ring buffer, so no one is going to clear the dirty bitmap for us. Reviewed-by: Andrew Jones Signed-off-by: Peter Xu Message-Id: <20201001012228.5916-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 752ec158ac59..6a8275a22861 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -195,7 +195,7 @@ static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap) page); } - if (test_bit_le(page, bmap)) { + if (test_and_clear_bit_le(page, bmap)) { host_dirty_count++; /* * If the bit is set, the value written onto -- cgit v1.2.3 From afdb1960071935cfd5c1908691a34cc6e36931f7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 30 Sep 2020 21:22:33 -0400 Subject: KVM: selftests: Use a single binary for dirty/clear log test Remove the clear_dirty_log test, instead merge it into the existing dirty_log_test. It should be cleaner to use this single binary to do both tests, also it's a preparation for the upcoming dirty ring test. The default behavior will run all the modes in sequence. Reviewed-by: Andrew Jones Signed-off-by: Peter Xu Message-Id: <20201001012233.6013-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 2 - tools/testing/selftests/kvm/clear_dirty_log_test.c | 6 - tools/testing/selftests/kvm/dirty_log_test.c | 187 +++++++++++++++++---- 3 files changed, 156 insertions(+), 39 deletions(-) delete mode 100644 tools/testing/selftests/kvm/clear_dirty_log_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 2b0d7d325e2a..c8af04dccf7f 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -59,7 +59,6 @@ TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test TEST_GEN_PROGS_x86_64 += x86_64/debug_regs TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/user_msr_test -TEST_GEN_PROGS_x86_64 += clear_dirty_log_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus @@ -68,7 +67,6 @@ TEST_GEN_PROGS_x86_64 += steal_time TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve -TEST_GEN_PROGS_aarch64 += clear_dirty_log_test TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus diff --git a/tools/testing/selftests/kvm/clear_dirty_log_test.c b/tools/testing/selftests/kvm/clear_dirty_log_test.c deleted file mode 100644 index 11672ec6f74e..000000000000 --- a/tools/testing/selftests/kvm/clear_dirty_log_test.c +++ /dev/null @@ -1,6 +0,0 @@ -#define USE_CLEAR_DIRTY_LOG -#define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) -#define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) -#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ - KVM_DIRTY_LOG_INITIALLY_SET) -#include "dirty_log_test.c" diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 6a8275a22861..139ccb550618 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -128,6 +128,78 @@ static uint64_t host_dirty_count; static uint64_t host_clear_count; static uint64_t host_track_next_count; +enum log_mode_t { + /* Only use KVM_GET_DIRTY_LOG for logging */ + LOG_MODE_DIRTY_LOG = 0, + + /* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */ + LOG_MODE_CLEAR_LOG = 1, + + LOG_MODE_NUM, + + /* Run all supported modes */ + LOG_MODE_ALL = LOG_MODE_NUM, +}; + +/* Mode of logging to test. Default is to run all supported modes */ +static enum log_mode_t host_log_mode_option = LOG_MODE_ALL; +/* Logging mode for current run */ +static enum log_mode_t host_log_mode; + +static bool clear_log_supported(void) +{ + return kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); +} + +static void clear_log_create_vm_done(struct kvm_vm *vm) +{ + struct kvm_enable_cap cap = {}; + u64 manual_caps; + + manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + TEST_ASSERT(manual_caps, "MANUAL_CAPS is zero!"); + manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | + KVM_DIRTY_LOG_INITIALLY_SET); + cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; + cap.args[0] = manual_caps; + vm_enable_cap(vm, &cap); +} + +static void dirty_log_collect_dirty_pages(struct kvm_vm *vm, int slot, + void *bitmap, uint32_t num_pages) +{ + kvm_vm_get_dirty_log(vm, slot, bitmap); +} + +static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot, + void *bitmap, uint32_t num_pages) +{ + kvm_vm_get_dirty_log(vm, slot, bitmap); + kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages); +} + +struct log_mode { + const char *name; + /* Return true if this mode is supported, otherwise false */ + bool (*supported)(void); + /* Hook when the vm creation is done (before vcpu creation) */ + void (*create_vm_done)(struct kvm_vm *vm); + /* Hook to collect the dirty pages into the bitmap provided */ + void (*collect_dirty_pages) (struct kvm_vm *vm, int slot, + void *bitmap, uint32_t num_pages); +} log_modes[LOG_MODE_NUM] = { + { + .name = "dirty-log", + .collect_dirty_pages = dirty_log_collect_dirty_pages, + }, + { + .name = "clear-log", + .supported = clear_log_supported, + .create_vm_done = clear_log_create_vm_done, + .collect_dirty_pages = clear_log_collect_dirty_pages, + }, +}; + /* * We use this bitmap to track some pages that should have its dirty * bit set in the _next_ iteration. For example, if we detected the @@ -137,6 +209,44 @@ static uint64_t host_track_next_count; */ static unsigned long *host_bmap_track; +static void log_modes_dump(void) +{ + int i; + + printf("all"); + for (i = 0; i < LOG_MODE_NUM; i++) + printf(", %s", log_modes[i].name); + printf("\n"); +} + +static bool log_mode_supported(void) +{ + struct log_mode *mode = &log_modes[host_log_mode]; + + if (mode->supported) + return mode->supported(); + + return true; +} + +static void log_mode_create_vm_done(struct kvm_vm *vm) +{ + struct log_mode *mode = &log_modes[host_log_mode]; + + if (mode->create_vm_done) + mode->create_vm_done(vm); +} + +static void log_mode_collect_dirty_pages(struct kvm_vm *vm, int slot, + void *bitmap, uint32_t num_pages) +{ + struct log_mode *mode = &log_modes[host_log_mode]; + + TEST_ASSERT(mode->collect_dirty_pages != NULL, + "collect_dirty_pages() is required for any log mode!"); + mode->collect_dirty_pages(vm, slot, bitmap, num_pages); +} + static void generate_random_array(uint64_t *guest_array, uint64_t size) { uint64_t i; @@ -257,6 +367,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, #ifdef __x86_64__ vm_create_irqchip(vm); #endif + log_mode_create_vm_done(vm); vm_vcpu_add_default(vm, vcpuid, guest_code); return vm; } @@ -264,10 +375,6 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, #define DIRTY_MEM_BITS 30 /* 1G */ #define PAGE_SHIFT_4K 12 -#ifdef USE_CLEAR_DIRTY_LOG -static u64 dirty_log_manual_caps; -#endif - static void run_test(enum vm_guest_mode mode, unsigned long iterations, unsigned long interval, uint64_t phys_offset) { @@ -275,6 +382,12 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, struct kvm_vm *vm; unsigned long *bmap; + if (!log_mode_supported()) { + print_skip("Log mode '%s' not supported", + log_modes[host_log_mode].name); + return; + } + /* * We reserve page table for 2 times of extra dirty mem which * will definitely cover the original (1G+) test range. Here @@ -317,14 +430,6 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, bmap = bitmap_alloc(host_num_pages); host_bmap_track = bitmap_alloc(host_num_pages); -#ifdef USE_CLEAR_DIRTY_LOG - struct kvm_enable_cap cap = {}; - - cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; - cap.args[0] = dirty_log_manual_caps; - vm_enable_cap(vm, &cap); -#endif - /* Add an extra memory slot for testing dirty logging */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, guest_test_phys_mem, @@ -362,11 +467,8 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, while (iteration < iterations) { /* Give the vcpu thread some time to dirty some pages */ usleep(interval * 1000); - kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); -#ifdef USE_CLEAR_DIRTY_LOG - kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0, - host_num_pages); -#endif + log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX, + bmap, host_num_pages); vm_dirty_log_verify(mode, bmap); iteration++; sync_global_to_guest(vm, iteration); @@ -410,6 +512,9 @@ static void help(char *name) TEST_HOST_LOOP_INTERVAL); printf(" -p: specify guest physical test memory offset\n" " Warning: a low offset can conflict with the loaded test code.\n"); + printf(" -M: specify the host logging mode " + "(default: run all log modes). Supported modes: \n\t"); + log_modes_dump(); printf(" -m: specify the guest mode ID to test " "(default: test all supported modes)\n" " This option may be used multiple times.\n" @@ -429,18 +534,7 @@ int main(int argc, char *argv[]) bool mode_selected = false; uint64_t phys_offset = 0; unsigned int mode; - int opt, i; - -#ifdef USE_CLEAR_DIRTY_LOG - dirty_log_manual_caps = - kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); - if (!dirty_log_manual_caps) { - print_skip("KVM_CLEAR_DIRTY_LOG not available"); - exit(KSFT_SKIP); - } - dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | - KVM_DIRTY_LOG_INITIALLY_SET); -#endif + int opt, i, j; #ifdef __x86_64__ guest_mode_init(VM_MODE_PXXV48_4K, true, true); @@ -464,7 +558,7 @@ int main(int argc, char *argv[]) guest_mode_init(VM_MODE_P40V48_4K, true, true); #endif - while ((opt = getopt(argc, argv, "hi:I:p:m:")) != -1) { + while ((opt = getopt(argc, argv, "hi:I:p:m:M:")) != -1) { switch (opt) { case 'i': iterations = strtol(optarg, NULL, 10); @@ -486,6 +580,26 @@ int main(int argc, char *argv[]) "Guest mode ID %d too big", mode); guest_modes[mode].enabled = true; break; + case 'M': + if (!strcmp(optarg, "all")) { + host_log_mode_option = LOG_MODE_ALL; + break; + } + for (i = 0; i < LOG_MODE_NUM; i++) { + if (!strcmp(optarg, log_modes[i].name)) { + pr_info("Setting log mode to: '%s'\n", + optarg); + host_log_mode_option = i; + break; + } + } + if (i == LOG_MODE_NUM) { + printf("Log mode '%s' invalid. Please choose " + "from: ", optarg); + log_modes_dump(); + exit(1); + } + break; case 'h': default: help(argv[0]); @@ -507,7 +621,18 @@ int main(int argc, char *argv[]) TEST_ASSERT(guest_modes[i].supported, "Guest mode ID %d (%s) not supported.", i, vm_guest_mode_string(i)); - run_test(i, iterations, interval, phys_offset); + if (host_log_mode_option == LOG_MODE_ALL) { + /* Run each log mode */ + for (j = 0; j < LOG_MODE_NUM; j++) { + pr_info("Testing Log Mode '%s'\n", + log_modes[j].name); + host_log_mode = j; + run_test(i, iterations, interval, phys_offset); + } + } else { + host_log_mode = host_log_mode_option; + run_test(i, iterations, interval, phys_offset); + } } return 0; -- cgit v1.2.3 From 4b5d12b0e21cc9f9f00201819844fcafb020ffad Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 27 Oct 2020 16:37:29 -0700 Subject: KVM: selftests: Factor code out of demand_paging_test Much of the code in demand_paging_test can be reused by other, similar multi-vCPU-memory-touching-perfromance-tests. Factor that common code out for reuse. No functional change expected. This series was tested by running the following invocations on an Intel Skylake machine: dirty_log_perf_test -b 20m -i 100 -v 64 dirty_log_perf_test -b 20g -i 5 -v 4 dirty_log_perf_test -b 4g -i 5 -v 32 demand_paging_test -b 20m -v 64 demand_paging_test -b 20g -v 4 demand_paging_test -b 4g -v 32 All behaved as expected. Signed-off-by: Ben Gardon Message-Id: <20201027233733.1484855-2-bgardon@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/demand_paging_test.c | 204 +++------------------ .../testing/selftests/kvm/include/perf_test_util.h | 187 +++++++++++++++++++ 2 files changed, 210 insertions(+), 181 deletions(-) create mode 100644 tools/testing/selftests/kvm/include/perf_test_util.h diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 360cd3ea4cd6..4251e98ceb69 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -21,18 +21,12 @@ #include #include -#include "test_util.h" -#include "kvm_util.h" +#include "perf_test_util.h" #include "processor.h" +#include "test_util.h" #ifdef __NR_userfaultfd -/* The memory slot index demand page */ -#define TEST_MEM_SLOT_INDEX 1 - -/* Default guest test virtual memory offset */ -#define DEFAULT_GUEST_TEST_MEM 0xc0000000 - #define DEFAULT_GUEST_TEST_MEM_SIZE (1 << 30) /* 1G */ #ifdef PRINT_PER_PAGE_UPDATES @@ -47,75 +41,14 @@ #define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__) #endif -#define MAX_VCPUS 512 - -/* - * Guest/Host shared variables. Ensure addr_gva2hva() and/or - * sync_global_to/from_guest() are used when accessing from - * the host. READ/WRITE_ONCE() should also be used with anything - * that may change. - */ -static uint64_t host_page_size; -static uint64_t guest_page_size; - static char *guest_data_prototype; -/* - * Guest physical memory offset of the testing memory slot. - * This will be set to the topmost valid physical address minus - * the test memory size. - */ -static uint64_t guest_test_phys_mem; - -/* - * Guest virtual memory offset of the testing memory slot. - * Must not conflict with identity mapped test code. - */ -static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; - -struct vcpu_args { - uint64_t gva; - uint64_t pages; - - /* Only used by the host userspace part of the vCPU thread */ - int vcpu_id; - struct kvm_vm *vm; -}; - -static struct vcpu_args vcpu_args[MAX_VCPUS]; - -/* - * Continuously write to the first 8 bytes of each page in the demand paging - * memory region. - */ -static void guest_code(uint32_t vcpu_id) -{ - uint64_t gva; - uint64_t pages; - int i; - - /* Make sure vCPU args data structure is not corrupt. */ - GUEST_ASSERT(vcpu_args[vcpu_id].vcpu_id == vcpu_id); - - gva = vcpu_args[vcpu_id].gva; - pages = vcpu_args[vcpu_id].pages; - - for (i = 0; i < pages; i++) { - uint64_t addr = gva + (i * guest_page_size); - - addr &= ~(host_page_size - 1); - *(uint64_t *)addr = 0x0123456789ABCDEF; - } - - GUEST_SYNC(1); -} - static void *vcpu_worker(void *data) { int ret; - struct vcpu_args *args = (struct vcpu_args *)data; - struct kvm_vm *vm = args->vm; - int vcpu_id = args->vcpu_id; + struct vcpu_args *vcpu_args = (struct vcpu_args *)data; + int vcpu_id = vcpu_args->vcpu_id; + struct kvm_vm *vm = perf_test_args.vm; struct kvm_run *run; struct timespec start, end, ts_diff; @@ -141,39 +74,6 @@ static void *vcpu_worker(void *data) return NULL; } -#define PAGE_SHIFT_4K 12 -#define PTES_PER_4K_PT 512 - -static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes) -{ - struct kvm_vm *vm; - uint64_t pages = DEFAULT_GUEST_PHY_PAGES; - - /* Account for a few pages per-vCPU for stacks */ - pages += DEFAULT_STACK_PGS * vcpus; - - /* - * Reserve twice the ammount of memory needed to map the test region and - * the page table / stacks region, at 4k, for page tables. Do the - * calculation with 4K page size: the smallest of all archs. (e.g., 64K - * page size guest will need even less memory for page tables). - */ - pages += (2 * pages) / PTES_PER_4K_PT; - pages += ((2 * vcpus * vcpu_memory_bytes) >> PAGE_SHIFT_4K) / - PTES_PER_4K_PT; - pages = vm_adjust_num_guest_pages(mode, pages); - - pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - - vm = _vm_create(mode, pages, O_RDWR); - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); -#ifdef __x86_64__ - vm_create_irqchip(vm); -#endif - return vm; -} - static int handle_uffd_page_request(int uffd, uint64_t addr) { pid_t tid; @@ -186,7 +86,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr) copy.src = (uint64_t)guest_data_prototype; copy.dst = addr; - copy.len = host_page_size; + copy.len = perf_test_args.host_page_size; copy.mode = 0; clock_gettime(CLOCK_MONOTONIC, &start); @@ -203,7 +103,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr) PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid, timespec_to_ns(timespec_sub(end, start))); PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n", - host_page_size, addr, tid); + perf_test_args.host_page_size, addr, tid); return 0; } @@ -360,64 +260,21 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, struct timespec start, end, ts_diff; int *pipefds = NULL; struct kvm_vm *vm; - uint64_t guest_num_pages; int vcpu_id; int r; vm = create_vm(mode, vcpus, vcpu_memory_bytes); - guest_page_size = vm_get_page_size(vm); - - TEST_ASSERT(vcpu_memory_bytes % guest_page_size == 0, - "Guest memory size is not guest page size aligned."); - - guest_num_pages = (vcpus * vcpu_memory_bytes) / guest_page_size; - guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); - - /* - * If there should be more memory in the guest test region than there - * can be pages in the guest, it will definitely cause problems. - */ - TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), - "Requested more guest memory than address space allows.\n" - " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n", - guest_num_pages, vm_get_max_gfn(vm), vcpus, - vcpu_memory_bytes); - - host_page_size = getpagesize(); - TEST_ASSERT(vcpu_memory_bytes % host_page_size == 0, - "Guest memory size is not host page size aligned."); - - guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * - guest_page_size; - guest_test_phys_mem &= ~(host_page_size - 1); - -#ifdef __s390x__ - /* Align to 1M (segment size) */ - guest_test_phys_mem &= ~((1 << 20) - 1); -#endif - - pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); - - /* Add an extra memory slot for testing demand paging */ - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - guest_test_phys_mem, - TEST_MEM_SLOT_INDEX, - guest_num_pages, 0); - - /* Do mapping for the demand paging memory slot */ - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); - - ucall_init(vm, NULL); - - guest_data_prototype = malloc(host_page_size); + guest_data_prototype = malloc(perf_test_args.host_page_size); TEST_ASSERT(guest_data_prototype, "Failed to allocate buffer for guest data pattern"); - memset(guest_data_prototype, 0xAB, host_page_size); + memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size); vcpu_threads = malloc(vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); + add_vcpus(vm, vcpus, vcpu_memory_bytes); + if (use_uffd) { uffd_handler_threads = malloc(vcpus * sizeof(*uffd_handler_threads)); @@ -428,22 +285,18 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, pipefds = malloc(sizeof(int) * vcpus * 2); TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd"); - } - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { - vm_paddr_t vcpu_gpa; - void *vcpu_hva; - - vm_vcpu_add_default(vm, vcpu_id, guest_code); + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + vm_paddr_t vcpu_gpa; + void *vcpu_hva; - vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes); - PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", - vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes); + vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes); + PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", + vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes); - /* Cache the HVA pointer of the region */ - vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); + /* Cache the HVA pointer of the region */ + vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); - if (use_uffd) { /* * Set up user fault fd to handle demand paging * requests. @@ -460,22 +313,10 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, if (r < 0) exit(-r); } - -#ifdef __x86_64__ - vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid()); -#endif - - vcpu_args[vcpu_id].vm = vm; - vcpu_args[vcpu_id].vcpu_id = vcpu_id; - vcpu_args[vcpu_id].gva = guest_test_virt_mem + - (vcpu_id * vcpu_memory_bytes); - vcpu_args[vcpu_id].pages = vcpu_memory_bytes / guest_page_size; } /* Export the shared variables to the guest */ - sync_global_to_guest(vm, host_page_size); - sync_global_to_guest(vm, guest_page_size); - sync_global_to_guest(vm, vcpu_args); + sync_global_to_guest(vm, perf_test_args); pr_info("Finished creating vCPUs and starting uffd threads\n"); @@ -483,7 +324,7 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, - &vcpu_args[vcpu_id]); + &perf_test_args.vcpu_args[vcpu_id]); } pr_info("Started all vCPUs\n"); @@ -514,7 +355,8 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, pr_info("Total guest execution time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); pr_info("Overall demand paging rate: %f pgs/sec\n", - guest_num_pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); + perf_test_args.vcpu_args[0].pages * vcpus / + ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); ucall_uninit(vm); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h new file mode 100644 index 000000000000..f71f0858a1f2 --- /dev/null +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * tools/testing/selftests/kvm/include/perf_test_util.h + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef SELFTEST_KVM_PERF_TEST_UTIL_H +#define SELFTEST_KVM_PERF_TEST_UTIL_H + +#include "kvm_util.h" +#include "processor.h" + +#define MAX_VCPUS 512 + +#define PAGE_SHIFT_4K 12 +#define PTES_PER_4K_PT 512 + +#define TEST_MEM_SLOT_INDEX 1 + +/* Default guest test virtual memory offset */ +#define DEFAULT_GUEST_TEST_MEM 0xc0000000 + +/* + * Guest physical memory offset of the testing memory slot. + * This will be set to the topmost valid physical address minus + * the test memory size. + */ +static uint64_t guest_test_phys_mem; + +/* + * Guest virtual memory offset of the testing memory slot. + * Must not conflict with identity mapped test code. + */ +static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; + +struct vcpu_args { + uint64_t gva; + uint64_t pages; + + /* Only used by the host userspace part of the vCPU thread */ + int vcpu_id; +}; + +struct perf_test_args { + struct kvm_vm *vm; + uint64_t host_page_size; + uint64_t guest_page_size; + + struct vcpu_args vcpu_args[MAX_VCPUS]; +}; + +static struct perf_test_args perf_test_args; + +/* + * Continuously write to the first 8 bytes of each page in the + * specified region. + */ +static void guest_code(uint32_t vcpu_id) +{ + struct vcpu_args *vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; + uint64_t gva; + uint64_t pages; + int i; + + /* Make sure vCPU args data structure is not corrupt. */ + GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id); + + gva = vcpu_args->gva; + pages = vcpu_args->pages; + + for (i = 0; i < pages; i++) { + uint64_t addr = gva + (i * perf_test_args.guest_page_size); + + addr &= ~(perf_test_args.host_page_size - 1); + *(uint64_t *)addr = 0x0123456789ABCDEF; + } + + GUEST_SYNC(1); +} + +static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus, + uint64_t vcpu_memory_bytes) +{ + struct kvm_vm *vm; + uint64_t pages = DEFAULT_GUEST_PHY_PAGES; + uint64_t guest_num_pages; + + /* Account for a few pages per-vCPU for stacks */ + pages += DEFAULT_STACK_PGS * vcpus; + + /* + * Reserve twice the ammount of memory needed to map the test region and + * the page table / stacks region, at 4k, for page tables. Do the + * calculation with 4K page size: the smallest of all archs. (e.g., 64K + * page size guest will need even less memory for page tables). + */ + pages += (2 * pages) / PTES_PER_4K_PT; + pages += ((2 * vcpus * vcpu_memory_bytes) >> PAGE_SHIFT_4K) / + PTES_PER_4K_PT; + pages = vm_adjust_num_guest_pages(mode, pages); + + pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + + vm = _vm_create(mode, pages, O_RDWR); + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); +#ifdef __x86_64__ + vm_create_irqchip(vm); +#endif + + perf_test_args.vm = vm; + perf_test_args.guest_page_size = vm_get_page_size(vm); + perf_test_args.host_page_size = getpagesize(); + + TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, + "Guest memory size is not guest page size aligned."); + + guest_num_pages = (vcpus * vcpu_memory_bytes) / + perf_test_args.guest_page_size; + guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + + /* + * If there should be more memory in the guest test region than there + * can be pages in the guest, it will definitely cause problems. + */ + TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), + "Requested more guest memory than address space allows.\n" + " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n", + guest_num_pages, vm_get_max_gfn(vm), vcpus, + vcpu_memory_bytes); + + TEST_ASSERT(vcpu_memory_bytes % perf_test_args.host_page_size == 0, + "Guest memory size is not host page size aligned."); + + guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * + perf_test_args.guest_page_size; + guest_test_phys_mem &= ~(perf_test_args.host_page_size - 1); + +#ifdef __s390x__ + /* Align to 1M (segment size) */ + guest_test_phys_mem &= ~((1 << 20) - 1); +#endif + + pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); + + /* Add an extra memory slot for testing */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + guest_test_phys_mem, + TEST_MEM_SLOT_INDEX, + guest_num_pages, 0); + + /* Do mapping for the demand paging memory slot */ + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + + ucall_init(vm, NULL); + + return vm; +} + +static void add_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes) +{ + vm_paddr_t vcpu_gpa; + struct vcpu_args *vcpu_args; + int vcpu_id; + + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; + + vm_vcpu_add_default(vm, vcpu_id, guest_code); + +#ifdef __x86_64__ + vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid()); +#endif + + vcpu_args->vcpu_id = vcpu_id; + vcpu_args->gva = guest_test_virt_mem + + (vcpu_id * vcpu_memory_bytes); + vcpu_args->pages = vcpu_memory_bytes / + perf_test_args.guest_page_size; + + vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes); + pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", + vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes); + } +} + +#endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */ -- cgit v1.2.3 From 2fe5149bdfbf3c2cdfafd2b5b496252d45ca1f78 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 27 Oct 2020 16:37:30 -0700 Subject: KVM: selftests: Remove address rounding in guest code Rounding the address the guest writes to a host page boundary will only have an effect if the host page size is larger than the guest page size, but in that case the guest write would still go to the same host page. There's no reason to round the address down, so remove the rounding to simplify the demand paging test. This series was tested by running the following invocations on an Intel Skylake machine: dirty_log_perf_test -b 20m -i 100 -v 64 dirty_log_perf_test -b 20g -i 5 -v 4 dirty_log_perf_test -b 4g -i 5 -v 32 demand_paging_test -b 20m -v 64 demand_paging_test -b 20g -v 4 demand_paging_test -b 4g -v 32 All behaved as expected. Signed-off-by: Ben Gardon Message-Id: <20201027233733.1484855-3-bgardon@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/perf_test_util.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index f71f0858a1f2..838f946700f0 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -72,7 +72,6 @@ static void guest_code(uint32_t vcpu_id) for (i = 0; i < pages; i++) { uint64_t addr = gva + (i * perf_test_args.guest_page_size); - addr &= ~(perf_test_args.host_page_size - 1); *(uint64_t *)addr = 0x0123456789ABCDEF; } -- cgit v1.2.3 From 1eafbd27edb5098ed6b6bc404c35d56c78beb0fd Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 27 Oct 2020 16:37:31 -0700 Subject: KVM: selftests: Simplify demand_paging_test with timespec_diff_now Add a helper function to get the current time and return the time since a given start time. Use that function to simplify the timekeeping in the demand paging test. This series was tested by running the following invocations on an Intel Skylake machine: dirty_log_perf_test -b 20m -i 100 -v 64 dirty_log_perf_test -b 20g -i 5 -v 4 dirty_log_perf_test -b 4g -i 5 -v 32 demand_paging_test -b 20m -v 64 demand_paging_test -b 20g -v 4 demand_paging_test -b 4g -v 32 All behaved as expected. Signed-off-by: Ben Gardon Message-Id: <20201027233733.1484855-4-bgardon@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/demand_paging_test.c | 26 ++++++++++++------------ tools/testing/selftests/kvm/include/test_util.h | 1 + tools/testing/selftests/kvm/lib/test_util.c | 15 ++++++++++++-- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 4251e98ceb69..7de6feb00076 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -50,7 +50,8 @@ static void *vcpu_worker(void *data) int vcpu_id = vcpu_args->vcpu_id; struct kvm_vm *vm = perf_test_args.vm; struct kvm_run *run; - struct timespec start, end, ts_diff; + struct timespec start; + struct timespec ts_diff; vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); @@ -66,8 +67,7 @@ static void *vcpu_worker(void *data) exit_reason_str(run->exit_reason)); } - clock_gettime(CLOCK_MONOTONIC, &end); - ts_diff = timespec_sub(end, start); + ts_diff = timespec_diff_now(start); PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id, ts_diff.tv_sec, ts_diff.tv_nsec); @@ -78,7 +78,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr) { pid_t tid; struct timespec start; - struct timespec end; + struct timespec ts_diff; struct uffdio_copy copy; int r; @@ -98,10 +98,10 @@ static int handle_uffd_page_request(int uffd, uint64_t addr) return r; } - clock_gettime(CLOCK_MONOTONIC, &end); + ts_diff = timespec_diff_now(start); PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid, - timespec_to_ns(timespec_sub(end, start))); + timespec_to_ns(ts_diff)); PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n", perf_test_args.host_page_size, addr, tid); @@ -123,7 +123,8 @@ static void *uffd_handler_thread_fn(void *arg) int pipefd = uffd_args->pipefd; useconds_t delay = uffd_args->delay; int64_t pages = 0; - struct timespec start, end, ts_diff; + struct timespec start; + struct timespec ts_diff; clock_gettime(CLOCK_MONOTONIC, &start); while (!quit_uffd_thread) { @@ -192,8 +193,7 @@ static void *uffd_handler_thread_fn(void *arg) pages++; } - clock_gettime(CLOCK_MONOTONIC, &end); - ts_diff = timespec_sub(end, start); + ts_diff = timespec_diff_now(start); PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n", pages, ts_diff.tv_sec, ts_diff.tv_nsec, pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); @@ -257,7 +257,8 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, pthread_t *vcpu_threads; pthread_t *uffd_handler_threads = NULL; struct uffd_handler_args *uffd_args = NULL; - struct timespec start, end, ts_diff; + struct timespec start; + struct timespec ts_diff; int *pipefds = NULL; struct kvm_vm *vm; int vcpu_id; @@ -335,9 +336,9 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id); } - pr_info("All vCPU threads joined\n"); + ts_diff = timespec_diff_now(start); - clock_gettime(CLOCK_MONOTONIC, &end); + pr_info("All vCPU threads joined\n"); if (use_uffd) { char c; @@ -351,7 +352,6 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, } } - ts_diff = timespec_sub(end, start); pr_info("Total guest execution time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); pr_info("Overall demand paging rate: %f pgs/sec\n", diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 5eb01bf51b86..1cc036ddb0c5 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -64,5 +64,6 @@ int64_t timespec_to_ns(struct timespec ts); struct timespec timespec_add_ns(struct timespec ts, int64_t ns); struct timespec timespec_add(struct timespec ts1, struct timespec ts2); struct timespec timespec_sub(struct timespec ts1, struct timespec ts2); +struct timespec timespec_diff_now(struct timespec start); #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 689e97c27ee2..1a46c2c48c7c 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -4,10 +4,13 @@ * * Copyright (C) 2020, Google LLC. */ -#include + +#include #include #include -#include +#include +#include + #include "test_util.h" /* @@ -81,6 +84,14 @@ struct timespec timespec_sub(struct timespec ts1, struct timespec ts2) return timespec_add_ns((struct timespec){0}, ns1 - ns2); } +struct timespec timespec_diff_now(struct timespec start) +{ + struct timespec end; + + clock_gettime(CLOCK_MONOTONIC, &end); + return timespec_sub(end, start); +} + void print_skip(const char *fmt, ...) { va_list ap; -- cgit v1.2.3 From 92ab4b9a22cfea9b0d353e86024208040c10e807 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 27 Oct 2020 16:37:32 -0700 Subject: KVM: selftests: Add wrfract to common guest code Wrfract will be used by the dirty logging perf test introduced later in this series to dirty memory sparsely. This series was tested by running the following invocations on an Intel Skylake machine: dirty_log_perf_test -b 20m -i 100 -v 64 dirty_log_perf_test -b 20g -i 5 -v 4 dirty_log_perf_test -b 4g -i 5 -v 32 demand_paging_test -b 20m -v 64 demand_paging_test -b 20g -v 4 demand_paging_test -b 4g -v 32 All behaved as expected. Signed-off-by: Ben Gardon Message-Id: <20201027233733.1484855-5-bgardon@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/demand_paging_test.c | 2 ++ tools/testing/selftests/kvm/include/perf_test_util.h | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 7de6feb00076..47defc65aeda 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -266,6 +266,8 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, vm = create_vm(mode, vcpus, vcpu_memory_bytes); + perf_test_args.wr_fract = 1; + guest_data_prototype = malloc(perf_test_args.host_page_size); TEST_ASSERT(guest_data_prototype, "Failed to allocate buffer for guest data pattern"); diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 838f946700f0..1716300469c0 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -46,6 +46,7 @@ struct perf_test_args { struct kvm_vm *vm; uint64_t host_page_size; uint64_t guest_page_size; + int wr_fract; struct vcpu_args vcpu_args[MAX_VCPUS]; }; @@ -72,7 +73,10 @@ static void guest_code(uint32_t vcpu_id) for (i = 0; i < pages; i++) { uint64_t addr = gva + (i * perf_test_args.guest_page_size); - *(uint64_t *)addr = 0x0123456789ABCDEF; + if (i % perf_test_args.wr_fract == 0) + *(uint64_t *)addr = 0x0123456789ABCDEF; + else + READ_ONCE(*(uint64_t *)addr); } GUEST_SYNC(1); -- cgit v1.2.3 From f663132d1e09166db419afb9832d463e0a79f3d5 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 4 Nov 2020 22:23:48 +0100 Subject: KVM: selftests: Drop pointless vm_create wrapper Signed-off-by: Andrew Jones Message-Id: <20201104212357.171559-3-drjones@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_test.c | 2 +- tools/testing/selftests/kvm/include/kvm_util.h | 1 - tools/testing/selftests/kvm/include/perf_test_util.h | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 7 +------ 4 files changed, 3 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 139ccb550618..54da9cc20db4 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -362,7 +362,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); + vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); kvm_vm_elf_load(vm, program_invocation_name, 0, 0); #ifdef __x86_64__ vm_create_irqchip(vm); diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index feb949a92055..7d29aa786959 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -68,7 +68,6 @@ int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); -struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); void kvm_vm_free(struct kvm_vm *vmp); void kvm_vm_restart(struct kvm_vm *vmp, int perm); void kvm_vm_release(struct kvm_vm *vmp); diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 1716300469c0..8e053e9dc7ea 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -105,7 +105,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - vm = _vm_create(mode, pages, O_RDWR); + vm = vm_create(mode, pages, O_RDWR); kvm_vm_elf_load(vm, program_invocation_name, 0, 0); #ifdef __x86_64__ vm_create_irqchip(vm); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 7669cb7c86e3..126c6727a6b0 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -180,7 +180,7 @@ _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) * descriptor to control the created VM is created with the permissions * given by perm (e.g. O_RDWR). */ -struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) +struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) { struct kvm_vm *vm; @@ -271,11 +271,6 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) return vm; } -struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) -{ - return _vm_create(mode, phy_pages, perm); -} - /* * VM Restart * -- cgit v1.2.3 From 6769155fece2100506e22161945712afae61769f Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 4 Nov 2020 22:23:52 +0100 Subject: KVM: selftests: Make the per vcpu memory size global Rename vcpu_memory_bytes to something with "percpu" in it in order to be less ambiguous. Also make it global to simplify things. Signed-off-by: Andrew Jones Message-Id: <20201104212357.171559-7-drjones@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/demand_paging_test.c | 20 ++++++++------------ tools/testing/selftests/kvm/include/perf_test_util.h | 3 +++ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 47defc65aeda..33acd954a298 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -27,8 +27,6 @@ #ifdef __NR_userfaultfd -#define DEFAULT_GUEST_TEST_MEM_SIZE (1 << 30) /* 1G */ - #ifdef PRINT_PER_PAGE_UPDATES #define PER_PAGE_DEBUG(...) printf(__VA_ARGS__) #else @@ -251,8 +249,7 @@ static int setup_demand_paging(struct kvm_vm *vm, } static void run_test(enum vm_guest_mode mode, bool use_uffd, - useconds_t uffd_delay, int vcpus, - uint64_t vcpu_memory_bytes) + useconds_t uffd_delay, int vcpus) { pthread_t *vcpu_threads; pthread_t *uffd_handler_threads = NULL; @@ -264,7 +261,7 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, int vcpu_id; int r; - vm = create_vm(mode, vcpus, vcpu_memory_bytes); + vm = create_vm(mode, vcpus, guest_percpu_mem_size); perf_test_args.wr_fract = 1; @@ -276,7 +273,7 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, vcpu_threads = malloc(vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - add_vcpus(vm, vcpus, vcpu_memory_bytes); + add_vcpus(vm, vcpus, guest_percpu_mem_size); if (use_uffd) { uffd_handler_threads = @@ -293,9 +290,9 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, vm_paddr_t vcpu_gpa; void *vcpu_hva; - vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes); + vcpu_gpa = guest_test_phys_mem + (vcpu_id * guest_percpu_mem_size); PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", - vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes); + vcpu_id, vcpu_gpa, vcpu_gpa + guest_percpu_mem_size); /* Cache the HVA pointer of the region */ vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); @@ -312,7 +309,7 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, &uffd_handler_threads[vcpu_id], pipefds[vcpu_id * 2], uffd_delay, &uffd_args[vcpu_id], - vcpu_hva, vcpu_memory_bytes); + vcpu_hva, guest_percpu_mem_size); if (r < 0) exit(-r); } @@ -413,7 +410,6 @@ static void help(char *name) int main(int argc, char *argv[]) { bool mode_selected = false; - uint64_t vcpu_memory_bytes = DEFAULT_GUEST_TEST_MEM_SIZE; int vcpus = 1; unsigned int mode; int opt, i; @@ -463,7 +459,7 @@ int main(int argc, char *argv[]) "A negative UFFD delay is not supported."); break; case 'b': - vcpu_memory_bytes = parse_size(optarg); + guest_percpu_mem_size = parse_size(optarg); break; case 'v': vcpus = atoi(optarg); @@ -486,7 +482,7 @@ int main(int argc, char *argv[]) TEST_ASSERT(guest_modes[i].supported, "Guest mode ID %d (%s) not supported.", i, vm_guest_mode_string(i)); - run_test(i, use_uffd, uffd_delay, vcpus, vcpu_memory_bytes); + run_test(i, use_uffd, uffd_delay, vcpus); } return 0; diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 8e053e9dc7ea..840dc2a19ce1 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -21,6 +21,8 @@ /* Default guest test virtual memory offset */ #define DEFAULT_GUEST_TEST_MEM 0xc0000000 +#define DEFAULT_PER_VCPU_MEM_SIZE (1 << 30) /* 1G */ + /* * Guest physical memory offset of the testing memory slot. * This will be set to the topmost valid physical address minus @@ -33,6 +35,7 @@ static uint64_t guest_test_phys_mem; * Must not conflict with identity mapped test code. */ static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; struct vcpu_args { uint64_t gva; -- cgit v1.2.3 From 3be18630954672b889186e7be9b631f00134e954 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 4 Nov 2020 22:23:53 +0100 Subject: KVM: selftests: Make the number of vcpus global We also check the input number of vcpus against the maximum supported. Signed-off-by: Andrew Jones Message-Id: <20201104212357.171559-8-drjones@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/demand_paging_test.c | 37 ++++++++++------------ .../testing/selftests/kvm/include/perf_test_util.h | 3 ++ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 33acd954a298..3d96a7bfaff3 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -249,7 +249,7 @@ static int setup_demand_paging(struct kvm_vm *vm, } static void run_test(enum vm_guest_mode mode, bool use_uffd, - useconds_t uffd_delay, int vcpus) + useconds_t uffd_delay) { pthread_t *vcpu_threads; pthread_t *uffd_handler_threads = NULL; @@ -261,7 +261,7 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, int vcpu_id; int r; - vm = create_vm(mode, vcpus, guest_percpu_mem_size); + vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size); perf_test_args.wr_fract = 1; @@ -270,23 +270,23 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, "Failed to allocate buffer for guest data pattern"); memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size); - vcpu_threads = malloc(vcpus * sizeof(*vcpu_threads)); + vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - add_vcpus(vm, vcpus, guest_percpu_mem_size); + add_vcpus(vm, nr_vcpus, guest_percpu_mem_size); if (use_uffd) { uffd_handler_threads = - malloc(vcpus * sizeof(*uffd_handler_threads)); + malloc(nr_vcpus * sizeof(*uffd_handler_threads)); TEST_ASSERT(uffd_handler_threads, "Memory allocation failed"); - uffd_args = malloc(vcpus * sizeof(*uffd_args)); + uffd_args = malloc(nr_vcpus * sizeof(*uffd_args)); TEST_ASSERT(uffd_args, "Memory allocation failed"); - pipefds = malloc(sizeof(int) * vcpus * 2); + pipefds = malloc(sizeof(int) * nr_vcpus * 2); TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd"); - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { vm_paddr_t vcpu_gpa; void *vcpu_hva; @@ -322,7 +322,7 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, clock_gettime(CLOCK_MONOTONIC, &start); - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, &perf_test_args.vcpu_args[vcpu_id]); } @@ -330,7 +330,7 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, pr_info("Started all vCPUs\n"); /* Wait for the vcpu threads to quit */ - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { pthread_join(vcpu_threads[vcpu_id], NULL); PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id); } @@ -343,7 +343,7 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, char c; /* Tell the user fault fd handler threads to quit */ - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { r = write(pipefds[vcpu_id * 2 + 1], &c, 1); TEST_ASSERT(r == 1, "Unable to write to pipefd"); @@ -354,7 +354,7 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, pr_info("Total guest execution time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); pr_info("Overall demand paging rate: %f pgs/sec\n", - perf_test_args.vcpu_args[0].pages * vcpus / + perf_test_args.vcpu_args[0].pages * nr_vcpus / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); ucall_uninit(vm); @@ -409,8 +409,8 @@ static void help(char *name) int main(int argc, char *argv[]) { + int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); bool mode_selected = false; - int vcpus = 1; unsigned int mode; int opt, i; bool use_uffd = false; @@ -462,12 +462,9 @@ int main(int argc, char *argv[]) guest_percpu_mem_size = parse_size(optarg); break; case 'v': - vcpus = atoi(optarg); - TEST_ASSERT(vcpus > 0, - "Must have a positive number of vCPUs"); - TEST_ASSERT(vcpus <= MAX_VCPUS, - "This test does not currently support\n" - "more than %d vCPUs.", MAX_VCPUS); + nr_vcpus = atoi(optarg); + TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, + "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; case 'h': default: @@ -482,7 +479,7 @@ int main(int argc, char *argv[]) TEST_ASSERT(guest_modes[i].supported, "Guest mode ID %d (%s) not supported.", i, vm_guest_mode_string(i)); - run_test(i, use_uffd, uffd_delay, vcpus); + run_test(i, use_uffd, uffd_delay); } return 0; diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 840dc2a19ce1..05eeb5d2bfc4 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -37,6 +37,9 @@ static uint64_t guest_test_phys_mem; static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; +/* Number of VCPUs for the test */ +static int nr_vcpus = 1; + struct vcpu_args { uint64_t gva; uint64_t pages; -- cgit v1.2.3 From 4fd94ec7d566ee2f0b52111cc6d26dd311f8a7c3 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 27 Oct 2020 16:37:33 -0700 Subject: KVM: selftests: Introduce the dirty log perf test The dirty log perf test will time verious dirty logging operations (enabling dirty logging, dirtying memory, getting the dirty log, clearing the dirty log, and disabling dirty logging) in order to quantify dirty logging performance. This test can be used to inform future performance improvements to KVM's dirty logging infrastructure. This series was tested by running the following invocations on an Intel Skylake machine: dirty_log_perf_test -b 20m -i 100 -v 64 dirty_log_perf_test -b 20g -i 5 -v 4 dirty_log_perf_test -b 4g -i 5 -v 32 demand_paging_test -b 20m -v 64 demand_paging_test -b 20g -v 4 demand_paging_test -b 4g -v 32 All behaved as expected. Signed-off-by: Ben Gardon Message-Id: <20201027233733.1484855-6-bgardon@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/dirty_log_perf_test.c | 376 +++++++++++++++++++++ .../testing/selftests/kvm/include/perf_test_util.h | 18 +- tools/testing/selftests/kvm/include/test_util.h | 1 + tools/testing/selftests/kvm/lib/test_util.c | 7 + 6 files changed, 396 insertions(+), 8 deletions(-) create mode 100644 tools/testing/selftests/kvm/dirty_log_perf_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index ea1692d43411..7a2c242b7152 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -27,6 +27,7 @@ /clear_dirty_log_test /demand_paging_test /dirty_log_test +/dirty_log_perf_test /kvm_create_max_vcpus /set_memory_region_test /steal_time diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index c8af04dccf7f..3d14ef77755e 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -61,6 +61,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/user_msr_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test +TEST_GEN_PROGS_x86_64 += dirty_log_perf_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c new file mode 100644 index 000000000000..cda7b000b1bc --- /dev/null +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty page logging performance test + * + * Based on dirty_log_test.c + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2020, Google, Inc. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include +#include +#include +#include +#include +#include +#include + +#include "kvm_util.h" +#include "perf_test_util.h" +#include "processor.h" +#include "test_util.h" + +/* How many host loops to run by default (one KVM_GET_DIRTY_LOG for each loop)*/ +#define TEST_HOST_LOOP_N 2UL + +/* Host variables */ +static bool host_quit; +static uint64_t iteration; +static uint64_t vcpu_last_completed_iteration[MAX_VCPUS]; + +static void *vcpu_worker(void *data) +{ + int ret; + struct kvm_vm *vm = perf_test_args.vm; + uint64_t pages_count = 0; + struct kvm_run *run; + struct timespec start; + struct timespec ts_diff; + struct timespec total = (struct timespec){0}; + struct timespec avg; + struct vcpu_args *vcpu_args = (struct vcpu_args *)data; + int vcpu_id = vcpu_args->vcpu_id; + + vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + run = vcpu_state(vm, vcpu_id); + + while (!READ_ONCE(host_quit)) { + uint64_t current_iteration = READ_ONCE(iteration); + + clock_gettime(CLOCK_MONOTONIC, &start); + ret = _vcpu_run(vm, vcpu_id); + ts_diff = timespec_diff_now(start); + + TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); + TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC, + "Invalid guest sync status: exit_reason=%s\n", + exit_reason_str(run->exit_reason)); + + pr_debug("Got sync event from vCPU %d\n", vcpu_id); + vcpu_last_completed_iteration[vcpu_id] = current_iteration; + pr_debug("vCPU %d updated last completed iteration to %lu\n", + vcpu_id, vcpu_last_completed_iteration[vcpu_id]); + + if (current_iteration) { + pages_count += vcpu_args->pages; + total = timespec_add(total, ts_diff); + pr_debug("vCPU %d iteration %lu dirty memory time: %ld.%.9lds\n", + vcpu_id, current_iteration, ts_diff.tv_sec, + ts_diff.tv_nsec); + } else { + pr_debug("vCPU %d iteration %lu populate memory time: %ld.%.9lds\n", + vcpu_id, current_iteration, ts_diff.tv_sec, + ts_diff.tv_nsec); + } + + while (current_iteration == READ_ONCE(iteration) && + !READ_ONCE(host_quit)) {} + } + + avg = timespec_div(total, vcpu_last_completed_iteration[vcpu_id]); + pr_debug("\nvCPU %d dirtied 0x%lx pages over %lu iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", + vcpu_id, pages_count, vcpu_last_completed_iteration[vcpu_id], + total.tv_sec, total.tv_nsec, avg.tv_sec, avg.tv_nsec); + + return NULL; +} + +#ifdef USE_CLEAR_DIRTY_LOG +static u64 dirty_log_manual_caps; +#endif + +static void run_test(enum vm_guest_mode mode, unsigned long iterations, + uint64_t phys_offset, int wr_fract) +{ + pthread_t *vcpu_threads; + struct kvm_vm *vm; + unsigned long *bmap; + uint64_t guest_num_pages; + uint64_t host_num_pages; + int vcpu_id; + struct timespec start; + struct timespec ts_diff; + struct timespec get_dirty_log_total = (struct timespec){0}; + struct timespec vcpu_dirty_total = (struct timespec){0}; + struct timespec avg; +#ifdef USE_CLEAR_DIRTY_LOG + struct kvm_enable_cap cap = {}; + struct timespec clear_dirty_log_total = (struct timespec){0}; +#endif + + vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size); + + perf_test_args.wr_fract = wr_fract; + + guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm); + guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + host_num_pages = vm_num_host_pages(mode, guest_num_pages); + bmap = bitmap_alloc(host_num_pages); + +#ifdef USE_CLEAR_DIRTY_LOG + cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; + cap.args[0] = dirty_log_manual_caps; + vm_enable_cap(vm, &cap); +#endif + + vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); + TEST_ASSERT(vcpu_threads, "Memory allocation failed"); + + add_vcpus(vm, nr_vcpus, guest_percpu_mem_size); + + sync_global_to_guest(vm, perf_test_args); + + /* Start the iterations */ + iteration = 0; + host_quit = false; + + clock_gettime(CLOCK_MONOTONIC, &start); + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, + &perf_test_args.vcpu_args[vcpu_id]); + } + + /* Allow the vCPU to populate memory */ + pr_debug("Starting iteration %lu - Populating\n", iteration); + while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration) + pr_debug("Waiting for vcpu_last_completed_iteration == %lu\n", + iteration); + + ts_diff = timespec_diff_now(start); + pr_info("Populate memory time: %ld.%.9lds\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + /* Enable dirty logging */ + clock_gettime(CLOCK_MONOTONIC, &start); + vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, + KVM_MEM_LOG_DIRTY_PAGES); + ts_diff = timespec_diff_now(start); + pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + while (iteration < iterations) { + /* + * Incrementing the iteration number will start the vCPUs + * dirtying memory again. + */ + clock_gettime(CLOCK_MONOTONIC, &start); + iteration++; + + pr_debug("Starting iteration %lu\n", iteration); + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration) + pr_debug("Waiting for vCPU %d vcpu_last_completed_iteration == %lu\n", + vcpu_id, iteration); + } + + ts_diff = timespec_diff_now(start); + vcpu_dirty_total = timespec_add(vcpu_dirty_total, ts_diff); + pr_info("Iteration %lu dirty memory time: %ld.%.9lds\n", + iteration, ts_diff.tv_sec, ts_diff.tv_nsec); + + clock_gettime(CLOCK_MONOTONIC, &start); + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + + ts_diff = timespec_diff_now(start); + get_dirty_log_total = timespec_add(get_dirty_log_total, + ts_diff); + pr_info("Iteration %lu get dirty log time: %ld.%.9lds\n", + iteration, ts_diff.tv_sec, ts_diff.tv_nsec); + +#ifdef USE_CLEAR_DIRTY_LOG + clock_gettime(CLOCK_MONOTONIC, &start); + kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0, + host_num_pages); + + ts_diff = timespec_diff_now(start); + clear_dirty_log_total = timespec_add(clear_dirty_log_total, + ts_diff); + pr_info("Iteration %lu clear dirty log time: %ld.%.9lds\n", + iteration, ts_diff.tv_sec, ts_diff.tv_nsec); +#endif + } + + /* Tell the vcpu thread to quit */ + host_quit = true; + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) + pthread_join(vcpu_threads[vcpu_id], NULL); + + /* Disable dirty logging */ + clock_gettime(CLOCK_MONOTONIC, &start); + vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, 0); + ts_diff = timespec_diff_now(start); + pr_info("Disabling dirty logging time: %ld.%.9lds\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + avg = timespec_div(get_dirty_log_total, iterations); + pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", + iterations, get_dirty_log_total.tv_sec, + get_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); + +#ifdef USE_CLEAR_DIRTY_LOG + avg = timespec_div(clear_dirty_log_total, iterations); + pr_info("Clear dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", + iterations, clear_dirty_log_total.tv_sec, + clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); +#endif + + free(bmap); + free(vcpu_threads); + ucall_uninit(vm); + kvm_vm_free(vm); +} + +struct guest_mode { + bool supported; + bool enabled; +}; +static struct guest_mode guest_modes[NUM_VM_MODES]; + +#define guest_mode_init(mode, supported, enabled) ({ \ + guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ +}) + +static void help(char *name) +{ + int i; + + puts(""); + printf("usage: %s [-h] [-i iterations] [-p offset] " + "[-m mode] [-b vcpu bytes] [-v vcpus]\n", name); + puts(""); + printf(" -i: specify iteration counts (default: %"PRIu64")\n", + TEST_HOST_LOOP_N); + printf(" -p: specify guest physical test memory offset\n" + " Warning: a low offset can conflict with the loaded test code.\n"); + printf(" -m: specify the guest mode ID to test " + "(default: test all supported modes)\n" + " This option may be used multiple times.\n" + " Guest mode IDs:\n"); + for (i = 0; i < NUM_VM_MODES; ++i) { + printf(" %d: %s%s\n", i, vm_guest_mode_string(i), + guest_modes[i].supported ? " (supported)" : ""); + } + printf(" -b: specify the size of the memory region which should be\n" + " dirtied by each vCPU. e.g. 10M or 3G.\n" + " (default: 1G)\n"); + printf(" -f: specify the fraction of pages which should be written to\n" + " as opposed to simply read, in the form\n" + " 1/.\n" + " (default: 1 i.e. all pages are written to.)\n"); + printf(" -v: specify the number of vCPUs to run.\n"); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + unsigned long iterations = TEST_HOST_LOOP_N; + bool mode_selected = false; + uint64_t phys_offset = 0; + unsigned int mode; + int opt, i; + int wr_fract = 1; + +#ifdef USE_CLEAR_DIRTY_LOG + dirty_log_manual_caps = + kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + if (!dirty_log_manual_caps) { + print_skip("KVM_CLEAR_DIRTY_LOG not available"); + exit(KSFT_SKIP); + } + dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | + KVM_DIRTY_LOG_INITIALLY_SET); +#endif + +#ifdef __x86_64__ + guest_mode_init(VM_MODE_PXXV48_4K, true, true); +#endif +#ifdef __aarch64__ + guest_mode_init(VM_MODE_P40V48_4K, true, true); + guest_mode_init(VM_MODE_P40V48_64K, true, true); + + { + unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); + + if (limit >= 52) + guest_mode_init(VM_MODE_P52V48_64K, true, true); + if (limit >= 48) { + guest_mode_init(VM_MODE_P48V48_4K, true, true); + guest_mode_init(VM_MODE_P48V48_64K, true, true); + } + } +#endif +#ifdef __s390x__ + guest_mode_init(VM_MODE_P40V48_4K, true, true); +#endif + + while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:")) != -1) { + switch (opt) { + case 'i': + iterations = strtol(optarg, NULL, 10); + break; + case 'p': + phys_offset = strtoull(optarg, NULL, 0); + break; + case 'm': + if (!mode_selected) { + for (i = 0; i < NUM_VM_MODES; ++i) + guest_modes[i].enabled = false; + mode_selected = true; + } + mode = strtoul(optarg, NULL, 10); + TEST_ASSERT(mode < NUM_VM_MODES, + "Guest mode ID %d too big", mode); + guest_modes[mode].enabled = true; + break; + case 'b': + guest_percpu_mem_size = parse_size(optarg); + break; + case 'f': + wr_fract = atoi(optarg); + TEST_ASSERT(wr_fract >= 1, + "Write fraction cannot be less than one"); + break; + case 'v': + nr_vcpus = atoi(optarg); + TEST_ASSERT(nr_vcpus > 0, + "Must have a positive number of vCPUs"); + TEST_ASSERT(nr_vcpus <= MAX_VCPUS, + "This test does not currently support\n" + "more than %d vCPUs.", MAX_VCPUS); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + TEST_ASSERT(iterations > 2, "Iterations must be greater than two"); + + pr_info("Test iterations: %"PRIu64"\n", iterations); + + for (i = 0; i < NUM_VM_MODES; ++i) { + if (!guest_modes[i].enabled) + continue; + TEST_ASSERT(guest_modes[i].supported, + "Guest mode ID %d (%s) not supported.", + i, vm_guest_mode_string(i)); + run_test(i, iterations, phys_offset, wr_fract); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 05eeb5d2bfc4..2618052057b1 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -76,16 +76,18 @@ static void guest_code(uint32_t vcpu_id) gva = vcpu_args->gva; pages = vcpu_args->pages; - for (i = 0; i < pages; i++) { - uint64_t addr = gva + (i * perf_test_args.guest_page_size); + while (true) { + for (i = 0; i < pages; i++) { + uint64_t addr = gva + (i * perf_test_args.guest_page_size); - if (i % perf_test_args.wr_fract == 0) - *(uint64_t *)addr = 0x0123456789ABCDEF; - else - READ_ONCE(*(uint64_t *)addr); - } + if (i % perf_test_args.wr_fract == 0) + *(uint64_t *)addr = 0x0123456789ABCDEF; + else + READ_ONCE(*(uint64_t *)addr); + } - GUEST_SYNC(1); + GUEST_SYNC(1); + } } static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus, diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 1cc036ddb0c5..ffffa560436b 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -65,5 +65,6 @@ struct timespec timespec_add_ns(struct timespec ts, int64_t ns); struct timespec timespec_add(struct timespec ts1, struct timespec ts2); struct timespec timespec_sub(struct timespec ts1, struct timespec ts2); struct timespec timespec_diff_now(struct timespec start); +struct timespec timespec_div(struct timespec ts, int divisor); #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 1a46c2c48c7c..8e04c0b1608e 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -92,6 +92,13 @@ struct timespec timespec_diff_now(struct timespec start) return timespec_sub(end, start); } +struct timespec timespec_div(struct timespec ts, int divisor) +{ + int64_t ns = timespec_to_ns(ts) / divisor; + + return timespec_add_ns((struct timespec){0}, ns); +} + void print_skip(const char *fmt, ...) { va_list ap; -- cgit v1.2.3 From 3c4e0dff2095c579b142d5a0693257f1c58b4804 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Sun, 8 Nov 2020 16:38:06 +0100 Subject: vt: Disable KD_FONT_OP_COPY It's buggy: On Fri, Nov 06, 2020 at 10:30:08PM +0800, Minh Yuan wrote: > We recently discovered a slab-out-of-bounds read in fbcon in the latest > kernel ( v5.10-rc2 for now ). The root cause of this vulnerability is that > "fbcon_do_set_font" did not handle "vc->vc_font.data" and > "vc->vc_font.height" correctly, and the patch > for VT_RESIZEX can't handle this > issue. > > Specifically, we use KD_FONT_OP_SET to set a small font.data for tty6, and > use KD_FONT_OP_SET again to set a large font.height for tty1. After that, > we use KD_FONT_OP_COPY to assign tty6's vc_font.data to tty1's vc_font.data > in "fbcon_do_set_font", while tty1 retains the original larger > height. Obviously, this will cause an out-of-bounds read, because we can > access a smaller vc_font.data with a larger vc_font.height. Further there was only one user ever. - Android's loadfont, busybox and console-tools only ever use OP_GET and OP_SET - fbset documentation only mentions the kernel cmdline font: option, not anything else. - systemd used OP_COPY before release 232 published in Nov 2016 Now unfortunately the crucial report seems to have gone down with gmane, and the commit message doesn't say much. But the pull request hints at OP_COPY being broken https://github.com/systemd/systemd/pull/3651 So in other words, this never worked, and the only project which foolishly every tried to use it, realized that rather quickly too. Instead of trying to fix security issues here on dead code by adding missing checks, fix the entire thing by removing the functionality. Note that systemd code using the OP_COPY function ignored the return value, so it doesn't matter what we're doing here really - just in case a lone server somewhere happens to be extremely unlucky and running an affected old version of systemd. The relevant code from font_copy_to_all_vcs() in systemd was: /* copy font from active VT, where the font was uploaded to */ cfo.op = KD_FONT_OP_COPY; cfo.height = vcs.v_active-1; /* tty1 == index 0 */ (void) ioctl(vcfd, KDFONTOP, &cfo); Note this just disables the ioctl, garbage collecting the now unused callbacks is left for -next. v2: Tetsuo found the old mail, which allowed me to find it on another archive. Add the link too. Acked-by: Peilin Ye Reported-by: Minh Yuan References: https://lists.freedesktop.org/archives/systemd-devel/2016-June/036935.html References: https://github.com/systemd/systemd/pull/3651 Cc: Greg KH Cc: Peilin Ye Cc: Tetsuo Handa Signed-off-by: Daniel Vetter Link: https://lore.kernel.org/r/20201108153806.3140315-1-daniel.vetter@ffwll.ch Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 9506a76f3ab6..d04a162939a4 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -4704,27 +4704,6 @@ static int con_font_default(struct vc_data *vc, struct console_font_op *op) return rc; } -static int con_font_copy(struct vc_data *vc, struct console_font_op *op) -{ - int con = op->height; - int rc; - - - console_lock(); - if (vc->vc_mode != KD_TEXT) - rc = -EINVAL; - else if (!vc->vc_sw->con_font_copy) - rc = -ENOSYS; - else if (con < 0 || !vc_cons_allocated(con)) - rc = -ENOTTY; - else if (con == vc->vc_num) /* nothing to do */ - rc = 0; - else - rc = vc->vc_sw->con_font_copy(vc, con); - console_unlock(); - return rc; -} - int con_font_op(struct vc_data *vc, struct console_font_op *op) { switch (op->op) { @@ -4735,7 +4714,8 @@ int con_font_op(struct vc_data *vc, struct console_font_op *op) case KD_FONT_OP_SET_DEFAULT: return con_font_default(vc, op); case KD_FONT_OP_COPY: - return con_font_copy(vc, op); + /* was buggy and never really used */ + return -EINVAL; } return -ENOSYS; } -- cgit v1.2.3 From b4e00444cab4c3f3fec876dc0cccc8cbb0d1a948 Mon Sep 17 00:00:00 2001 From: Eddy Wu Date: Sat, 7 Nov 2020 14:47:22 +0800 Subject: fork: fix copy_process(CLONE_PARENT) race with the exiting ->real_parent current->group_leader->exit_signal may change during copy_process() if current->real_parent exits. Move the assignment inside tasklist_lock to avoid the race. Signed-off-by: Eddy Wu Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..6d266388d380 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2167,14 +2167,9 @@ static __latent_entropy struct task_struct *copy_process( /* ok, now we should be set up.. */ p->pid = pid_nr(pid); if (clone_flags & CLONE_THREAD) { - p->exit_signal = -1; p->group_leader = current->group_leader; p->tgid = current->tgid; } else { - if (clone_flags & CLONE_PARENT) - p->exit_signal = current->group_leader->exit_signal; - else - p->exit_signal = args->exit_signal; p->group_leader = p; p->tgid = p->pid; } @@ -2218,9 +2213,14 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; + if (clone_flags & CLONE_THREAD) + p->exit_signal = -1; + else + p->exit_signal = current->group_leader->exit_signal; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; + p->exit_signal = args->exit_signal; } klp_copy_process(p); -- cgit v1.2.3 From ae2975046dbc65855c217fe6fbd5b33140c5ff18 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 6 Nov 2020 15:50:39 -0500 Subject: net/sunrpc: fix useless comparison in proc_do_xprt() In the original code, the "if (*lenp < 0)" check didn't work because "*lenp" is unsigned. Fortunately, the memory_read_from_buffer() call will never fail in this context so it doesn't affect runtime. Signed-off-by: Dan Carpenter Signed-off-by: J. Bruce Fields --- net/sunrpc/sysctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 5c9f5bca4d99..3aad6ef18504 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -63,19 +63,20 @@ static int proc_do_xprt(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; - size_t len; + ssize_t len; if (write || *ppos) { *lenp = 0; return 0; } len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); - *lenp = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); + len = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); - if (*lenp < 0) { + if (len < 0) { *lenp = 0; return -EINVAL; } + *lenp = len; return 0; } -- cgit v1.2.3 From f8394f232b1eab649ce2df5c5f15b0e528c92091 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 8 Nov 2020 16:10:16 -0800 Subject: Linux 5.10-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9e7fd6a065a7..008aba5f1a20 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 10 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Kleptomaniac Octopus # *DOCUMENTATION* -- cgit v1.2.3 From 6d6a18fdde8b86b919b740ad629153de432d12a8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 9 Nov 2020 09:45:17 -0500 Subject: KVM: selftests: allow two iterations of dirty_log_perf_test Even though one iteration is not enough for the dirty log performance test (due to the cost of building page tables, zeroing memory etc.) two is okay and it is the default. Without this patch, "./dirty_log_perf_test" without any further arguments fails. Cc: Ben Gardon Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_perf_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index cda7b000b1bc..85c9b8f73142 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -359,7 +359,7 @@ int main(int argc, char *argv[]) } } - TEST_ASSERT(iterations > 2, "Iterations must be greater than two"); + TEST_ASSERT(iterations >= 2, "The test should have at least two iterations"); pr_info("Test iterations: %"PRIu64"\n", iterations); -- cgit v1.2.3 From 1bd3387979bff49cb3115c497895d78ffd5092e3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 29 Oct 2020 21:32:41 +0200 Subject: Documentation: firmware-guide: gpio-properties: Fix factual mistakes Fix factual mistakes and style issues in GPIO properties document. This converts IoRestriction from InputOnly to OutputOnly as pins in the example are used as outputs. Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- .../firmware-guide/acpi/gpio-properties.rst | 29 +++++++++++----------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/Documentation/firmware-guide/acpi/gpio-properties.rst b/Documentation/firmware-guide/acpi/gpio-properties.rst index bb6d74f23ee0..e6e65ceb2ca1 100644 --- a/Documentation/firmware-guide/acpi/gpio-properties.rst +++ b/Documentation/firmware-guide/acpi/gpio-properties.rst @@ -20,9 +20,9 @@ index, like the ASL example below shows:: Name (_CRS, ResourceTemplate () { - GpioIo (Exclusive, PullUp, 0, 0, IoRestrictionInputOnly, + GpioIo (Exclusive, PullUp, 0, 0, IoRestrictionOutputOnly, "\\_SB.GPO0", 0, ResourceConsumer) {15} - GpioIo (Exclusive, PullUp, 0, 0, IoRestrictionInputOnly, + GpioIo (Exclusive, PullUp, 0, 0, IoRestrictionOutputOnly, "\\_SB.GPO0", 0, ResourceConsumer) {27, 31} }) @@ -49,7 +49,7 @@ index pin Pin in the GpioIo()/GpioInt() resource. Typically this is zero. active_low - If 1 the GPIO is marked as active_low. + If 1, the GPIO is marked as active_low. Since ACPI GpioIo() resource does not have a field saying whether it is active low or high, the "active_low" argument can be used here. Setting @@ -112,8 +112,8 @@ Example:: Package () { "gpio-line-names", Package () { - "SPI0_CS_N", "EXP2_INT", "MUX6_IO", "UART0_RXD", "MUX7_IO", - "LVL_C_A1", "MUX0_IO", "SPI1_MISO" + "SPI0_CS_N", "EXP2_INT", "MUX6_IO", "UART0_RXD", + "MUX7_IO", "LVL_C_A1", "MUX0_IO", "SPI1_MISO", } } @@ -137,7 +137,7 @@ to the GPIO lines it is going to use and provide the GPIO subsystem with a mapping between those names and the ACPI GPIO resources corresponding to them. To do that, the driver needs to define a mapping table as a NULL-terminated -array of struct acpi_gpio_mapping objects that each contain a name, a pointer +array of struct acpi_gpio_mapping objects that each contains a name, a pointer to an array of line data (struct acpi_gpio_params) objects and the size of that array. Each struct acpi_gpio_params object consists of three fields, crs_entry_index, line_index, active_low, representing the index of the target @@ -154,13 +154,14 @@ question would look like this:: static const struct acpi_gpio_mapping bluetooth_acpi_gpios[] = { { "reset-gpios", &reset_gpio, 1 }, { "shutdown-gpios", &shutdown_gpio, 1 }, - { }, + { } }; Next, the mapping table needs to be passed as the second argument to -acpi_dev_add_driver_gpios() that will register it with the ACPI device object -pointed to by its first argument. That should be done in the driver's .probe() -routine. On removal, the driver should unregister its GPIO mapping table by +acpi_dev_add_driver_gpios() or its managed analogue that will +register it with the ACPI device object pointed to by its first +argument. That should be done in the driver's .probe() routine. +On removal, the driver should unregister its GPIO mapping table by calling acpi_dev_remove_driver_gpios() on the ACPI device object where that table was previously registered. @@ -191,12 +192,12 @@ The driver might expect to get the right GPIO when it does:: but since there is no way to know the mapping between "reset" and the GpioIo() in _CRS desc will hold ERR_PTR(-ENOENT). -The driver author can solve this by passing the mapping explictly -(the recommended way and documented in the above chapter). +The driver author can solve this by passing the mapping explicitly +(this is the recommended way and it's documented in the above chapter). The ACPI GPIO mapping tables should not contaminate drivers that are not knowing about which exact device they are servicing on. It implies that -the ACPI GPIO mapping tables are hardly linked to ACPI ID and certain +the ACPI GPIO mapping tables are hardly linked to an ACPI ID and certain objects, as listed in the above chapter, of the device in question. Getting GPIO descriptor @@ -229,5 +230,5 @@ Case 2 explicitly tells GPIO core to look for resources in _CRS. Be aware that gpiod_get_index() in cases 1 and 2, assuming that there are two versions of ACPI device description provided and no mapping is present in the driver, will return different resources. That's why a -certain driver has to handle them carefully as explained in previous +certain driver has to handle them carefully as explained in the previous chapter. -- cgit v1.2.3 From 0d6c41cf801fd56b92f4359374667061d27a6472 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 29 Oct 2020 21:32:42 +0200 Subject: Documentation: firmware-guide: gpio-properties: active_low only for GpioIo() It appears that people may misinterpret active_low field in _DSD for GpioInt() resource. Add a paragraph to clarify this. Reported-by: Ricardo Ribalda Signed-off-by: Andy Shevchenko Reviewed-by: Mika Westerberg Reviewed-by: Ricardo Ribalda Signed-off-by: Rafael J. Wysocki --- Documentation/firmware-guide/acpi/gpio-properties.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/firmware-guide/acpi/gpio-properties.rst b/Documentation/firmware-guide/acpi/gpio-properties.rst index e6e65ceb2ca1..370fe46c6af9 100644 --- a/Documentation/firmware-guide/acpi/gpio-properties.rst +++ b/Documentation/firmware-guide/acpi/gpio-properties.rst @@ -55,6 +55,9 @@ Since ACPI GpioIo() resource does not have a field saying whether it is active low or high, the "active_low" argument can be used here. Setting it to 1 marks the GPIO as active low. +Note, active_low in _DSD does not make sense for GpioInt() resource and +must be 0. GpioInt() resource has its own means of defining it. + In our Bluetooth example the "reset-gpios" refers to the second GpioIo() resource, second pin in that resource with the GPIO number of 31. -- cgit v1.2.3 From 8b31e972f9872e5a6a3348506b5b84353fecef58 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 29 Oct 2020 21:32:43 +0200 Subject: Documentation: firmware-guide: gpio-properties: Clarify initial output state GpioIo() doesn't provide an explicit state for an output pin. Linux tries to be smart and uses a common sense based on other parameters. Document how it looks like in the code. Signed-off-by: Andy Shevchenko Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- .../firmware-guide/acpi/gpio-properties.rst | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/Documentation/firmware-guide/acpi/gpio-properties.rst b/Documentation/firmware-guide/acpi/gpio-properties.rst index 370fe46c6af9..59aad6138b6e 100644 --- a/Documentation/firmware-guide/acpi/gpio-properties.rst +++ b/Documentation/firmware-guide/acpi/gpio-properties.rst @@ -61,6 +61,29 @@ must be 0. GpioInt() resource has its own means of defining it. In our Bluetooth example the "reset-gpios" refers to the second GpioIo() resource, second pin in that resource with the GPIO number of 31. +The GpioIo() resource unfortunately doesn't explicitly provide an initial +state of the output pin which driver should use during its initialization. + +Linux tries to use common sense here and derives the state from the bias +and polarity settings. The table below shows the expectations: + +========= ============= ============== +Pull Bias Polarity Requested... +========= ============= ============== +Implicit x AS IS (assumed firmware configured for us) +Explicit x (no _DSD) as Pull Bias (Up == High, Down == Low), + assuming non-active (Polarity = !Pull Bias) +Down Low as low, assuming active +Down High as low, assuming non-active +Up Low as high, assuming non-active +Up High as high, assuming active +========= ============= ============== + +That said, for our above example the both GPIOs, since the bias setting +is explicit and _DSD is present, will be treated as active with a high +polarity and Linux will configure the pins in this state until a driver +reprograms them differently. + It is possible to leave holes in the array of GPIOs. This is useful in cases like with SPI host controllers where some chip selects may be implemented as GPIOs and some as native signals. For example a SPI host -- cgit v1.2.3 From c1e9735975c05d36ca97e9d39e9b06c3e0b3b0d7 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 2 Nov 2020 19:19:31 +0800 Subject: ACPI: scan: Fix acpi_dma_configure_id() kerneldoc name For some reason building with W=1 doesn't pick up on this, but the kerneldoc name for acpi_dma_configure_id() is not right, so fix it up. Signed-off-by: John Garry Acked-by: Lorenzo Pieralisi Signed-off-by: Rafael J. Wysocki --- drivers/acpi/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index a896e5e87c93..bc6a79e33220 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1453,7 +1453,7 @@ int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset, } /** - * acpi_dma_configure - Set-up DMA configuration for the device. + * acpi_dma_configure_id - Set-up DMA configuration for the device. * @dev: The pointer to the device * @attr: device dma attributes * @input_id: input device id const value pointer -- cgit v1.2.3 From c6237b210ddc4f026a368172e957cbd3d5b5c78a Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Thu, 5 Nov 2020 03:06:00 +0100 Subject: ACPI: Fix whitespace inconsistencies Replaces spaces with tabs where spaces have been (inconsistently) used for indentation and removes trailing whitespaces. Signed-off-by: Maximilian Luz Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_video.c | 6 +++--- drivers/acpi/battery.c | 2 +- drivers/acpi/event.c | 2 +- drivers/acpi/internal.h | 2 +- drivers/acpi/nfit/core.c | 10 +++++----- drivers/acpi/pci_irq.c | 2 +- drivers/acpi/pci_link.c | 12 ++++++------ drivers/acpi/pci_mcfg.c | 2 +- drivers/acpi/power.c | 6 +++--- drivers/acpi/processor_perflib.c | 6 +++--- drivers/acpi/sbs.c | 2 +- drivers/acpi/sbshc.c | 2 +- drivers/acpi/sbshc.h | 6 +++--- drivers/acpi/video_detect.c | 16 ++++++++-------- drivers/acpi/wakeup.c | 4 ++-- 15 files changed, 40 insertions(+), 40 deletions(-) diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index bc96457c9e25..a322a7bd286b 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -578,7 +578,7 @@ acpi_video_bqc_value_to_level(struct acpi_video_device *device, ACPI_VIDEO_FIRST_LEVEL - 1 - bqc_value; level = device->brightness->levels[bqc_value + - ACPI_VIDEO_FIRST_LEVEL]; + ACPI_VIDEO_FIRST_LEVEL]; } else { level = bqc_value; } @@ -990,8 +990,8 @@ set_level: goto out_free_levels; ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "found %d brightness levels\n", - br->count - ACPI_VIDEO_FIRST_LEVEL)); + "found %d brightness levels\n", + br->count - ACPI_VIDEO_FIRST_LEVEL)); return 0; out_free_levels: diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index cab4af532f36..08ee1c7b12e0 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -987,7 +987,7 @@ static int acpi_battery_update(struct acpi_battery *battery, bool resume) */ if ((battery->state & ACPI_BATTERY_STATE_CRITICAL) || (test_bit(ACPI_BATTERY_ALARM_PRESENT, &battery->flags) && - (battery->capacity_now <= battery->alarm))) + (battery->capacity_now <= battery->alarm))) acpi_pm_wakeup_event(&battery->device->dev); return result; diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c index 170643927044..92e59f45329b 100644 --- a/drivers/acpi/event.c +++ b/drivers/acpi/event.c @@ -31,7 +31,7 @@ int acpi_notifier_call_chain(struct acpi_device *dev, u32 type, u32 data) event.type = type; event.data = data; return (blocking_notifier_call_chain(&acpi_chain_head, 0, (void *)&event) - == NOTIFY_BAD) ? -EINVAL : 0; + == NOTIFY_BAD) ? -EINVAL : 0; } EXPORT_SYMBOL(acpi_notifier_call_chain); diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 43411a7457cd..e3638bafb941 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -134,7 +134,7 @@ int acpi_add_power_resource(acpi_handle handle); void acpi_power_add_remove_device(struct acpi_device *adev, bool add); int acpi_power_wakeup_list_init(struct list_head *list, int *system_level); int acpi_device_sleep_wake(struct acpi_device *dev, - int enable, int sleep_state, int dev_state); + int enable, int sleep_state, int dev_state); int acpi_power_get_inferred_state(struct acpi_device *device, int *state); int acpi_power_on_resources(struct acpi_device *device, int state); int acpi_power_transition(struct acpi_device *device, int state); diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 3a3c209ed3d3..442608220b5c 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -2175,10 +2175,10 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) * these commands. */ enum nfit_aux_cmds { - NFIT_CMD_TRANSLATE_SPA = 5, - NFIT_CMD_ARS_INJECT_SET = 7, - NFIT_CMD_ARS_INJECT_CLEAR = 8, - NFIT_CMD_ARS_INJECT_GET = 9, + NFIT_CMD_TRANSLATE_SPA = 5, + NFIT_CMD_ARS_INJECT_SET = 7, + NFIT_CMD_ARS_INJECT_CLEAR = 8, + NFIT_CMD_ARS_INJECT_GET = 9, }; static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) @@ -2632,7 +2632,7 @@ static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus, nfit_blk->bdw_offset = nfit_mem->bdw->offset; mmio = &nfit_blk->mmio[BDW]; mmio->addr.base = devm_nvdimm_memremap(dev, nfit_mem->spa_bdw->address, - nfit_mem->spa_bdw->length, nd_blk_memremap_flags(ndbr)); + nfit_mem->spa_bdw->length, nd_blk_memremap_flags(ndbr)); if (!mmio->addr.base) { dev_dbg(dev, "%s failed to map bdw\n", nvdimm_name(nvdimm)); diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index dea8a60e18a4..14ee631cb7cf 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -175,7 +175,7 @@ static int acpi_pci_irq_check_entry(acpi_handle handle, struct pci_dev *dev, * configure the IRQ assigned to this slot|dev|pin. The 'source_index' * indicates which resource descriptor in the resource template (of * the link device) this interrupt is allocated from. - * + * * NOTE: Don't query the Link Device for IRQ information at this time * because Link Device enumeration may not have occurred yet * (e.g. exists somewhere 'below' this _PRT entry in the ACPI diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index 606da5d77ad3..fb4c5632a232 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -6,8 +6,8 @@ * Copyright (C) 2001, 2002 Paul Diefenbaugh * Copyright (C) 2002 Dominik Brodowski * - * TBD: - * 1. Support more than one IRQ resource entry per link device (index). + * TBD: + * 1. Support more than one IRQ resource entry per link device (index). * 2. Implement start/stop mechanism and use ACPI Bus Driver facilities * for IRQ management (e.g. start()->_SRS). */ @@ -249,8 +249,8 @@ static int acpi_pci_link_get_current(struct acpi_pci_link *link) } } - /* - * Query and parse _CRS to get the current IRQ assignment. + /* + * Query and parse _CRS to get the current IRQ assignment. */ status = acpi_walk_resources(link->device->handle, METHOD_NAME__CRS, @@ -396,7 +396,7 @@ static int acpi_pci_link_set(struct acpi_pci_link *link, int irq) /* * "acpi_irq_balance" (default in APIC mode) enables ACPI to use PIC Interrupt * Link Devices to move the PIRQs around to minimize sharing. - * + * * "acpi_irq_nobalance" (default in PIC mode) tells ACPI not to move any PIC IRQs * that the BIOS has already set to active. This is necessary because * ACPI has no automatic means of knowing what ISA IRQs are used. Note that @@ -414,7 +414,7 @@ static int acpi_pci_link_set(struct acpi_pci_link *link, int irq) * * Note that PCI IRQ routers have a list of possible IRQs, * which may not include the IRQs this table says are available. - * + * * Since this heuristic can't tell the difference between a link * that no device will attach to, vs. a link which may be shared * by multiple active devices -- it is not optimal. diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c index 7ddd57abadd1..95f23acd5b80 100644 --- a/drivers/acpi/pci_mcfg.c +++ b/drivers/acpi/pci_mcfg.c @@ -173,7 +173,7 @@ static int pci_mcfg_quirk_matches(struct mcfg_fixup *f, u16 segment, { if (!memcmp(f->oem_id, mcfg_oem_id, ACPI_OEM_ID_SIZE) && !memcmp(f->oem_table_id, mcfg_oem_table_id, - ACPI_OEM_TABLE_ID_SIZE) && + ACPI_OEM_TABLE_ID_SIZE) && f->oem_revision == mcfg_oem_revision && f->segment == segment && resource_contains(&f->bus_range, bus_range)) diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index 837b875d075e..8048da85b7e0 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -13,7 +13,7 @@ * 1. via "Device Specific (D-State) Control" * 2. via "Power Resource Control". * The code below deals with ACPI Power Resources control. - * + * * An ACPI "power resource object" represents a software controllable power * plane, clock plane, or other resource depended on by a device. * @@ -645,7 +645,7 @@ int acpi_power_wakeup_list_init(struct list_head *list, int *system_level_p) * -ENODEV if the execution of either _DSW or _PSW has failed */ int acpi_device_sleep_wake(struct acpi_device *dev, - int enable, int sleep_state, int dev_state) + int enable, int sleep_state, int dev_state) { union acpi_object in_arg[3]; struct acpi_object_list arg_list = { 3, in_arg }; @@ -690,7 +690,7 @@ int acpi_device_sleep_wake(struct acpi_device *dev, /* * Prepare a wakeup device, two steps (Ref ACPI 2.0:P229): - * 1. Power on the power resources required for the wakeup device + * 1. Power on the power resources required for the wakeup device * 2. Execute _DSW (Device Sleep Wake) or (deprecated in ACPI 3.0) _PSW (Power * State Wake) for the device, if present */ diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 5909e8fa4013..b04a68950ff1 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -354,7 +354,7 @@ static int acpi_processor_get_performance_states(struct acpi_processor *pr) (u32) px->control, (u32) px->status)); /* - * Check that ACPI's u64 MHz will be valid as u32 KHz in cpufreq + * Check that ACPI's u64 MHz will be valid as u32 KHz in cpufreq */ if (!px->core_frequency || ((u32)(px->core_frequency * 1000) != @@ -627,7 +627,7 @@ int acpi_processor_preregister_performance( goto err_ret; /* - * Now that we have _PSD data from all CPUs, lets setup P-state + * Now that we have _PSD data from all CPUs, lets setup P-state * domain info. */ for_each_possible_cpu(i) { @@ -693,7 +693,7 @@ int acpi_processor_preregister_performance( if (match_pdomain->domain != pdomain->domain) continue; - match_pr->performance->shared_type = + match_pr->performance->shared_type = pr->performance->shared_type; cpumask_copy(match_pr->performance->shared_cpu_map, pr->performance->shared_cpu_map); diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index f158b8c30113..e6d9f4de2800 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -366,7 +366,7 @@ static int acpi_battery_get_state(struct acpi_battery *battery) state_readers[i].mode, ACPI_SBS_BATTERY, state_readers[i].command, - (u8 *)battery + + (u8 *)battery + state_readers[i].offset); if (result) goto end; diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c index 87b74e9015e5..53c2862c4c75 100644 --- a/drivers/acpi/sbshc.c +++ b/drivers/acpi/sbshc.c @@ -176,7 +176,7 @@ int acpi_smbus_write(struct acpi_smb_hc *hc, u8 protocol, u8 address, EXPORT_SYMBOL_GPL(acpi_smbus_write); int acpi_smbus_register_callback(struct acpi_smb_hc *hc, - smbus_alarm_callback callback, void *context) + smbus_alarm_callback callback, void *context) { mutex_lock(&hc->lock); hc->callback = callback; diff --git a/drivers/acpi/sbshc.h b/drivers/acpi/sbshc.h index c3522bb82792..695c390e2884 100644 --- a/drivers/acpi/sbshc.h +++ b/drivers/acpi/sbshc.h @@ -24,9 +24,9 @@ enum acpi_sbs_device_addr { typedef void (*smbus_alarm_callback)(void *context); extern int acpi_smbus_read(struct acpi_smb_hc *hc, u8 protocol, u8 address, - u8 command, u8 * data); + u8 command, u8 *data); extern int acpi_smbus_write(struct acpi_smb_hc *hc, u8 protocol, u8 slave_address, - u8 command, u8 * data, u8 length); + u8 command, u8 *data, u8 length); extern int acpi_smbus_register_callback(struct acpi_smb_hc *hc, - smbus_alarm_callback callback, void *context); + smbus_alarm_callback callback, void *context); extern int acpi_smbus_unregister_callback(struct acpi_smb_hc *hc); diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index 3a032afd9d05..4f5463b2a217 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -178,14 +178,14 @@ static const struct dmi_system_id video_detect_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad X201s"), }, }, - { - .callback = video_detect_force_video, - .ident = "ThinkPad X201T", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad X201T"), - }, - }, + { + .callback = video_detect_force_video, + .ident = "ThinkPad X201T", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad X201T"), + }, + }, /* The native backlight controls do not work on some older machines */ { diff --git a/drivers/acpi/wakeup.c b/drivers/acpi/wakeup.c index f89dd9a99e6e..b02bf770aead 100644 --- a/drivers/acpi/wakeup.c +++ b/drivers/acpi/wakeup.c @@ -44,7 +44,7 @@ void acpi_enable_wakeup_devices(u8 sleep_state) if (!dev->wakeup.flags.valid || sleep_state > (u32) dev->wakeup.sleep_state || !(device_may_wakeup(&dev->dev) - || dev->wakeup.prepare_count)) + || dev->wakeup.prepare_count)) continue; if (device_may_wakeup(&dev->dev)) @@ -69,7 +69,7 @@ void acpi_disable_wakeup_devices(u8 sleep_state) if (!dev->wakeup.flags.valid || sleep_state > (u32) dev->wakeup.sleep_state || !(device_may_wakeup(&dev->dev) - || dev->wakeup.prepare_count)) + || dev->wakeup.prepare_count)) continue; acpi_set_gpe_wake_mask(dev->wakeup.gpe_device, dev->wakeup.gpe_number, -- cgit v1.2.3 From 9debfb81e7654fe7388a49f45bc4d789b94c1103 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Sat, 7 Nov 2020 00:49:39 -0800 Subject: ACPI: GED: fix -Wformat Clang is more aggressive about -Wformat warnings when the format flag specifies a type smaller than the parameter. It turns out that gsi is an int. Fixes: drivers/acpi/evged.c:105:48: warning: format specifies type 'unsigned char' but the argument has type 'unsigned int' [-Wformat] trigger == ACPI_EDGE_SENSITIVE ? 'E' : 'L', gsi); ^~~ Link: https://github.com/ClangBuiltLinux/linux/issues/378 Fixes: ea6f3af4c5e6 ("ACPI: GED: add support for _Exx / _Lxx handler methods") Acked-by: Ard Biesheuvel Signed-off-by: Nick Desaulniers Signed-off-by: Rafael J. Wysocki --- drivers/acpi/evged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/evged.c b/drivers/acpi/evged.c index b1a7f8d6965e..fe6b6792c8bb 100644 --- a/drivers/acpi/evged.c +++ b/drivers/acpi/evged.c @@ -101,7 +101,7 @@ static acpi_status acpi_ged_request_interrupt(struct acpi_resource *ares, switch (gsi) { case 0 ... 255: - sprintf(ev_name, "_%c%02hhX", + sprintf(ev_name, "_%c%02X", trigger == ACPI_EDGE_SENSITIVE ? 'E' : 'L', gsi); if (ACPI_SUCCESS(acpi_get_handle(handle, ev_name, &evt_handle))) -- cgit v1.2.3 From 7daaa06357bf7f1874b62bb1ea9d66a51d4e567e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 7 Nov 2020 14:32:54 +0100 Subject: ACPI: button: Add DMI quirk for Medion Akoya E2228T The Medion Akoya E2228T's ACPI _LID implementation is quite broken, it has the same issues as the one from the Medion Akoya E2215T: 1. For notifications it uses an ActiveLow Edge GpioInt, rather then an ActiveBoth one, meaning that the device is only notified when the lid is closed, not when it is opened. 2. Matching with this its _LID method simply always returns 0 (closed) In order for the Linux LID code to work properly with this implementation, the lid_init_state selection needs to be set to ACPI_BUTTON_LID_INIT_OPEN, add a DMI quirk for this. While working on this I also found out that the MD60### part of the model number differs per country/batch while all of the E2215T and E2228T models have this issue, so also remove the " MD60198" part from the E2215T quirk. Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/button.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 0761529cac05..0d93a5ef4d07 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -89,7 +89,18 @@ static const struct dmi_system_id dmi_lid_quirks[] = { */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "MEDION"), - DMI_MATCH(DMI_PRODUCT_NAME, "E2215T MD60198"), + DMI_MATCH(DMI_PRODUCT_NAME, "E2215T"), + }, + .driver_data = (void *)(long)ACPI_BUTTON_LID_INIT_OPEN, + }, + { + /* + * Medion Akoya E2228T, notification of the LID device only + * happens on close, not on open and _LID always returns closed. + */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "MEDION"), + DMI_MATCH(DMI_PRODUCT_NAME, "E2228T"), }, .driver_data = (void *)(long)ACPI_BUTTON_LID_INIT_OPEN, }, -- cgit v1.2.3 From 8ef9ba4d666614497a057d09b0a6eafc1e34eadf Mon Sep 17 00:00:00 2001 From: Oliver Herms Date: Tue, 3 Nov 2020 11:41:33 +0100 Subject: IPv6: Set SIT tunnel hard_header_len to zero Due to the legacy usage of hard_header_len for SIT tunnels while already using infrastructure from net/ipv4/ip_tunnel.c the calculation of the path MTU in tnl_update_pmtu is incorrect. This leads to unnecessary creation of MTU exceptions for any flow going over a SIT tunnel. As SIT tunnels do not have a header themsevles other than their transport (L3, L2) headers we're leaving hard_header_len set to zero as tnl_update_pmtu is already taking care of the transport headers sizes. This will also help avoiding unnecessary IPv6 GC runs and spinlock contention seen when using SIT tunnels and for more than net.ipv6.route.gc_thresh flows. Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Signed-off-by: Oliver Herms Acked-by: Willem de Bruijn Link: https://lore.kernel.org/r/20201103104133.GA1573211@tws Signed-off-by: Jakub Kicinski --- net/ipv6/sit.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 5e2c34c0ac97..5e7983cb6154 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1128,7 +1128,6 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) if (tdev && !netif_is_l3_master(tdev)) { int t_hlen = tunnel->hlen + sizeof(struct iphdr); - dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); dev->mtu = tdev->mtu - t_hlen; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; @@ -1426,7 +1425,6 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->priv_destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; - dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - t_hlen; -- cgit v1.2.3 From 77a2d673d5c9d1d359b5652ff75043273c5dea28 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 6 Nov 2020 17:59:52 +0100 Subject: tunnels: Fix off-by-one in lower MTU bounds for ICMP/ICMPv6 replies Jianlin reports that a bridged IPv6 VXLAN endpoint, carrying IPv6 packets over a link with a PMTU estimation of exactly 1350 bytes, won't trigger ICMPv6 Packet Too Big replies when the encapsulated datagrams exceed said PMTU value. VXLAN over IPv6 adds 70 bytes of overhead, so an ICMPv6 reply indicating 1280 bytes as inner MTU would be legitimate and expected. This comes from an off-by-one error I introduced in checks added as part of commit 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets"), whose purpose was to prevent sending ICMPv6 Packet Too Big messages with an MTU lower than the smallest permissible IPv6 link MTU, i.e. 1280 bytes. In iptunnel_pmtud_check_icmpv6(), avoid triggering a reply only if the advertised MTU would be less than, and not equal to, 1280 bytes. Also fix the analogous comparison for IPv4, that is, skip the ICMP reply only if the resulting MTU is strictly less than 576 bytes. This becomes apparent while running the net/pmtu.sh bridged VXLAN or GENEVE selftests with adjusted lower-link MTU values. Using e.g. GENEVE, setting ll_mtu to the values reported below, in the test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() test function, we can see failures on the following tests: test | ll_mtu -------------------------------|-------- pmtu_ipv4_br_geneve4_exception | 626 pmtu_ipv6_br_geneve4_exception | 1330 pmtu_ipv6_br_geneve6_exception | 1350 owing to the different tunneling overheads implied by the corresponding configurations. Reported-by: Jianlin Shi Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Signed-off-by: Stefano Brivio Link: https://lore.kernel.org/r/4f5fc2f33bfdf8409549fafd4f952b008bf04d63.1604681709.git.sbrivio@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_tunnel_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 25f1caf5abf9..e25be2d01a7a 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -263,7 +263,7 @@ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) const struct icmphdr *icmph = icmp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); - if (mtu <= 576 || iph->frag_off != htons(IP_DF)) + if (mtu < 576 || iph->frag_off != htons(IP_DF)) return 0; if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) || @@ -359,7 +359,7 @@ static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu) __be16 frag_off; int offset; - if (mtu <= IPV6_MIN_MTU) + if (mtu < IPV6_MIN_MTU) return 0; if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST || -- cgit v1.2.3 From 413691384a37fe27f43460226c4160e33140e638 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 8 Nov 2020 00:46:15 +0000 Subject: ethtool: netlink: add missing netdev_features_change() call After updating userspace Ethtool from 5.7 to 5.9, I noticed that NETDEV_FEAT_CHANGE is no more raised when changing netdev features through Ethtool. That's because the old Ethtool ioctl interface always calls netdev_features_change() at the end of user request processing to inform the kernel that our netdevice has some features changed, but the new Netlink interface does not. Instead, it just notifies itself with ETHTOOL_MSG_FEATURES_NTF. Replace this ethtool_notify() call with netdev_features_change(), so the kernel will be aware of any features changes, just like in case with the ioctl interface. This does not omit Ethtool notifications, as Ethtool itself listens to NETDEV_FEAT_CHANGE and drops ETHTOOL_MSG_FEATURES_NTF on it (net/ethtool/netlink.c:ethnl_netdev_event()). From v1 [1]: - dropped extra new line as advised by Jakub; - no functional changes. [1] https://lore.kernel.org/netdev/AlZXQ2o5uuTVHCfNGOiGgJ8vJ3KgO5YIWAnQjH0cDE@cp3-web-009.plabs.ch Fixes: 0980bfcd6954 ("ethtool: set netdev features with FEATURES_SET request") Signed-off-by: Alexander Lobakin Reviewed-by: Michal Kubecek Link: https://lore.kernel.org/r/ahA2YWXYICz5rbUSQqNG4roJ8OlJzzYQX7PTiG80@cp4-web-028.plabs.ch Signed-off-by: Jakub Kicinski --- net/ethtool/features.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ethtool/features.c b/net/ethtool/features.c index 8ee4cdbd6b82..1c9f4df273bd 100644 --- a/net/ethtool/features.c +++ b/net/ethtool/features.c @@ -280,7 +280,7 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) active_diff_mask, compact); } if (mod) - ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL); + netdev_features_change(dev); out_rtnl: rtnl_unlock(); -- cgit v1.2.3 From 16eb0eb835c77c5e8824b8aa90b11b00ddc5c122 Mon Sep 17 00:00:00 2001 From: Jonathan Neuschäfer Date: Sat, 7 Nov 2020 23:08:21 +0100 Subject: docs: networking: phy: s/2.5 times faster/2.5 times as fast/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2.5 times faster would be 3.5 Gbps (4.375 Gbaud after 8b/10b encoding). Signed-off-by: Jonathan Neuschäfer Link: https://lore.kernel.org/r/20201107220822.1291215-1-j.neuschaefer@gmx.net Signed-off-by: Jakub Kicinski --- Documentation/networking/phy.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index 256106054c8c..b2f7ec794bc8 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -247,8 +247,8 @@ Some of the interface modes are described below: speeds (see below.) ``PHY_INTERFACE_MODE_2500BASEX`` - This defines a variant of 1000BASE-X which is clocked 2.5 times faster, - than the 802.3 standard giving a fixed bit rate of 3.125Gbaud. + This defines a variant of 1000BASE-X which is clocked 2.5 times as fast + as the 802.3 standard, giving a fixed bit rate of 3.125Gbaud. ``PHY_INTERFACE_MODE_SGMII`` This is used for Cisco SGMII, which is a modification of 1000BASE-X -- cgit v1.2.3 From 989ef49bdf100cc772b3a8737089df36b1ab1e30 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sun, 8 Nov 2020 19:49:59 +0100 Subject: mptcp: provide rmem[0] limit The mptcp proto struct currently does not provide the required limit for forward memory scheduling. Under pressure sk_rmem_schedule() will unconditionally try to use such field and will oops. Address the issue inheriting the tcp limit, as we already do for the wmem one. Fixes: 9c3f94e1681b ("mptcp: add missing memory scheduling in the rx path") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Link: https://lore.kernel.org/r/37af798bd46f402fb7c79f57ebbdd00614f5d7fa.1604861097.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e7419fd15d84..88f2a7a0ccb8 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2467,6 +2467,7 @@ static struct proto mptcp_prot = { .memory_pressure = &tcp_memory_pressure, .stream_memory_free = mptcp_memory_free, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, -- cgit v1.2.3 From 949dd0104c496fa7c14991a23c03c62e44637e71 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 10 Nov 2020 13:00:00 -0800 Subject: powercap: restrict energy meter to root access Remove non-privileged user access to power data contained in /sys/class/powercap/intel-rapl*/*/energy_uj Non-privileged users currently have read access to power data and can use this data to form a security attack. Some privileged drivers/applications need read access to this data, but don't expose it to non-privileged users. For example, thermald uses this data to ensure that power management works correctly. Thus removing non-privileged access is preferred over completely disabling this power reporting capability with CONFIG_INTEL_RAPL=n. Fixes: 95677a9a3847 ("PowerCap: Fix mode for energy counter") Signed-off-by: Len Brown Cc: stable@vger.kernel.org --- drivers/powercap/powercap_sys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/powercap/powercap_sys.c b/drivers/powercap/powercap_sys.c index f808c5fa9838..3f0b8e2ef3d4 100644 --- a/drivers/powercap/powercap_sys.c +++ b/drivers/powercap/powercap_sys.c @@ -367,9 +367,9 @@ static void create_power_zone_common_attributes( &dev_attr_max_energy_range_uj.attr; if (power_zone->ops->get_energy_uj) { if (power_zone->ops->reset_energy_uj) - dev_attr_energy_uj.attr.mode = S_IWUSR | S_IRUGO; + dev_attr_energy_uj.attr.mode = S_IWUSR | S_IRUSR; else - dev_attr_energy_uj.attr.mode = S_IRUGO; + dev_attr_energy_uj.attr.mode = S_IRUSR; power_zone->zone_dev_attrs[count++] = &dev_attr_energy_uj.attr; } -- cgit v1.2.3 From 3e9fa9983b9297407c2448114d6d27782d5e2ef2 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 30 Sep 2020 21:04:05 -0400 Subject: tools/power turbostat: update version number goodbye summer... Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 74d2fc6fc78a..f3a1746f7f45 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5712,7 +5712,7 @@ int get_and_dump_counters(void) } void print_version() { - fprintf(outf, "turbostat version 20.03.20" + fprintf(outf, "turbostat version 20.09.30" " - Len Brown \n"); } -- cgit v1.2.3 From 9a2a9ebc0a758d887ee06e067e9f7f0b36ff7574 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:25:57 +0100 Subject: cpufreq: Introduce governor flags A new cpufreq governor flag will be added subsequently, so replace the bool dynamic_switching fleid in struct cpufreq_governor with a flags field and introduce CPUFREQ_GOV_DYNAMIC_SWITCHING to set for the "dynamic switching" governors instead of it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 2 +- drivers/cpufreq/cpufreq_governor.h | 2 +- include/linux/cpufreq.h | 9 +++++++-- kernel/sched/cpufreq_schedutil.c | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 336b5e94cbc8..0252903f1b43 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2254,7 +2254,7 @@ static int cpufreq_init_governor(struct cpufreq_policy *policy) return -EINVAL; /* Platform doesn't want dynamic frequency switching ? */ - if (policy->governor->dynamic_switching && + if (policy->governor->flags & CPUFREQ_GOV_DYNAMIC_SWITCHING && cpufreq_driver->flags & CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING) { struct cpufreq_governor *gov = cpufreq_fallback_governor(); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index c56773c25757..bab8e6140377 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -156,7 +156,7 @@ void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy); #define CPUFREQ_DBS_GOVERNOR_INITIALIZER(_name_) \ { \ .name = _name_, \ - .dynamic_switching = true, \ + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, \ .owner = THIS_MODULE, \ .init = cpufreq_dbs_governor_init, \ .exit = cpufreq_dbs_governor_exit, \ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1eaa04f1bae6..9bdfcf3c4748 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -570,12 +570,17 @@ struct cpufreq_governor { char *buf); int (*store_setspeed) (struct cpufreq_policy *policy, unsigned int freq); - /* For governors which change frequency dynamically by themselves */ - bool dynamic_switching; struct list_head governor_list; struct module *owner; + u8 flags; }; +/* Governor flags */ + +/* For governors which change frequency dynamically by themselves */ +#define CPUFREQ_GOV_DYNAMIC_SWITCHING BIT(0) + + /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d73bccde2720..97d318b0cd0c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -881,7 +881,7 @@ static void sugov_limits(struct cpufreq_policy *policy) struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, - .dynamic_switching = true, + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, .init = sugov_init, .exit = sugov_exit, .start = sugov_start, -- cgit v1.2.3 From 218f66870181bec7aaa6e3c72f346039c590c3c2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:26:10 +0100 Subject: cpufreq: Introduce CPUFREQ_GOV_STRICT_TARGET Introduce a new governor flag, CPUFREQ_GOV_STRICT_TARGET, for the governors that want the target frequency to be set exactly to the given value without leaving any room for adjustments on the hardware side and set this flag for the powersave and performance governors. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq_performance.c | 1 + drivers/cpufreq/cpufreq_powersave.c | 1 + include/linux/cpufreq.h | 3 +++ 3 files changed, 5 insertions(+) diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c index 71c1d9aba772..addd93f2a420 100644 --- a/drivers/cpufreq/cpufreq_performance.c +++ b/drivers/cpufreq/cpufreq_performance.c @@ -20,6 +20,7 @@ static void cpufreq_gov_performance_limits(struct cpufreq_policy *policy) static struct cpufreq_governor cpufreq_gov_performance = { .name = "performance", .owner = THIS_MODULE, + .flags = CPUFREQ_GOV_STRICT_TARGET, .limits = cpufreq_gov_performance_limits, }; diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c index 7749522355b5..8d830d860e91 100644 --- a/drivers/cpufreq/cpufreq_powersave.c +++ b/drivers/cpufreq/cpufreq_powersave.c @@ -21,6 +21,7 @@ static struct cpufreq_governor cpufreq_gov_powersave = { .name = "powersave", .limits = cpufreq_gov_powersave_limits, .owner = THIS_MODULE, + .flags = CPUFREQ_GOV_STRICT_TARGET, }; MODULE_AUTHOR("Dominik Brodowski "); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9bdfcf3c4748..6eb9a3b8ec7b 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -580,6 +580,9 @@ struct cpufreq_governor { /* For governors which change frequency dynamically by themselves */ #define CPUFREQ_GOV_DYNAMIC_SWITCHING BIT(0) +/* For governors wanting the target frequency to be set exactly */ +#define CPUFREQ_GOV_STRICT_TARGET BIT(1) + /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, -- cgit v1.2.3 From ea9364bbadf11f0c55802cf11387d74f524cee84 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:26:37 +0100 Subject: cpufreq: Add strict_target to struct cpufreq_policy Add a new field to be set when the CPUFREQ_GOV_STRICT_TARGET flag is set for the current governor to struct cpufreq_policy, so that the drivers needing to check CPUFREQ_GOV_STRICT_TARGET do not have to access the governor object during every frequency transition. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 2 ++ include/linux/cpufreq.h | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 0252903f1b43..1e7e3f2ff09f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2280,6 +2280,8 @@ static int cpufreq_init_governor(struct cpufreq_policy *policy) } } + policy->strict_target = !!(policy->governor->flags & CPUFREQ_GOV_STRICT_TARGET); + return 0; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 6eb9a3b8ec7b..acbad3b36322 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -109,6 +109,12 @@ struct cpufreq_policy { bool fast_switch_possible; bool fast_switch_enabled; + /* + * Set if the CPUFREQ_GOV_STRICT_TARGET flag is set for the current + * governor. + */ + bool strict_target; + /* * Preferred average time interval between consecutive invocations of * the driver to set the frequency for this policy. To be set by the -- cgit v1.2.3 From fcb3a1ab79904d54499db77017793ccca665eb7e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:27:40 +0100 Subject: cpufreq: intel_pstate: Take CPUFREQ_GOV_STRICT_TARGET into account Make intel_pstate take the new CPUFREQ_GOV_STRICT_TARGET governor flag into account when it operates in the passive mode with HWP enabled, so as to fix the "powersave" governor behavior in that case (currently, HWP is allowed to scale the performance all the way up to the policy max limit when the "powersave" governor is used, but it should be constrained to the policy min limit then). Fixes: f6ebbcf08f37 ("cpufreq: intel_pstate: Implement passive mode with HWP enabled") Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Cc: 5.9+ # 5.9+: 9a2a9ebc0a75 cpufreq: Introduce governor flags Cc: 5.9+ # 5.9+: 218f66870181 cpufreq: Introduce CPUFREQ_GOV_STRICT_TARGET Cc: 5.9+ # 5.9+: ea9364bbadf1 cpufreq: Add strict_target to struct cpufreq_policy --- drivers/cpufreq/intel_pstate.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index b7a9779250aa..36a3ccfe6d3d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2527,7 +2527,7 @@ static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, in } static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 target_pstate, - bool fast_switch) + bool strict, bool fast_switch) { u64 prev = READ_ONCE(cpu->hwp_req_cached), value = prev; @@ -2539,7 +2539,7 @@ static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 target_pstate, * field in it, so opportunistically update the max too if needed. */ value &= ~HWP_MAX_PERF(~0L); - value |= HWP_MAX_PERF(cpu->max_perf_ratio); + value |= HWP_MAX_PERF(strict ? target_pstate : cpu->max_perf_ratio); if (value == prev) return; @@ -2562,14 +2562,16 @@ static void intel_cpufreq_adjust_perf_ctl(struct cpudata *cpu, pstate_funcs.get_val(cpu, target_pstate)); } -static int intel_cpufreq_update_pstate(struct cpudata *cpu, int target_pstate, - bool fast_switch) +static int intel_cpufreq_update_pstate(struct cpufreq_policy *policy, + int target_pstate, bool fast_switch) { + struct cpudata *cpu = all_cpu_data[policy->cpu]; int old_pstate = cpu->pstate.current_pstate; target_pstate = intel_pstate_prepare_request(cpu, target_pstate); if (hwp_active) { - intel_cpufreq_adjust_hwp(cpu, target_pstate, fast_switch); + intel_cpufreq_adjust_hwp(cpu, target_pstate, + policy->strict_target, fast_switch); cpu->pstate.current_pstate = target_pstate; } else if (target_pstate != old_pstate) { intel_cpufreq_adjust_perf_ctl(cpu, target_pstate, fast_switch); @@ -2609,7 +2611,7 @@ static int intel_cpufreq_target(struct cpufreq_policy *policy, break; } - target_pstate = intel_cpufreq_update_pstate(cpu, target_pstate, false); + target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, false); freqs.new = target_pstate * cpu->pstate.scaling; @@ -2628,7 +2630,7 @@ static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy, target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling); - target_pstate = intel_cpufreq_update_pstate(cpu, target_pstate, true); + target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, true); return target_pstate * cpu->pstate.scaling; } -- cgit v1.2.3 From 8d936bb13ce788c616084ab1a5754da3490a9f0c Mon Sep 17 00:00:00 2001 From: Flavio Suligoi Date: Tue, 10 Nov 2020 14:03:38 +0100 Subject: Documentation: ACPI: fix spelling mistakes Signed-off-by: Flavio Suligoi Signed-off-by: Rafael J. Wysocki --- Documentation/firmware-guide/acpi/acpi-lid.rst | 8 ++++---- Documentation/firmware-guide/acpi/method-tracing.rst | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/firmware-guide/acpi/acpi-lid.rst b/Documentation/firmware-guide/acpi/acpi-lid.rst index 874ce0ed340d..71b9af13a048 100644 --- a/Documentation/firmware-guide/acpi/acpi-lid.rst +++ b/Documentation/firmware-guide/acpi/acpi-lid.rst @@ -19,9 +19,9 @@ report the "current" state of the lid as either "opened" or "closed". For most platforms, both the _LID method and the lid notifications are reliable. However, there are exceptions. In order to work with these -exceptional buggy platforms, special restrictions and expections should be +exceptional buggy platforms, special restrictions and exceptions should be taken into account. This document describes the restrictions and the -expections of the Linux ACPI lid device driver. +exceptions of the Linux ACPI lid device driver. Restrictions of the returning value of the _LID control method @@ -46,7 +46,7 @@ state is changed to "closed". The "closed" notification is normally used to trigger some system power saving operations on Windows. Since it is fully tested, it is reliable from all AML tables. -Expections for the userspace users of the ACPI lid device driver +Exceptions for the userspace users of the ACPI lid device driver ================================================================ The ACPI button driver exports the lid state to the userspace via the @@ -100,7 +100,7 @@ use the following kernel parameter: C. button.lid_init_state=ignore: When this option is specified, the ACPI button driver never reports the initial lid state and there is a compensation mechanism implemented to - ensure that the reliable "closed" notifications can always be delievered + ensure that the reliable "closed" notifications can always be delivered to the userspace by always pairing "closed" input events with complement "opened" input events. But there is still no guarantee that the "opened" notifications can be delivered to the userspace when the lid is actually diff --git a/Documentation/firmware-guide/acpi/method-tracing.rst b/Documentation/firmware-guide/acpi/method-tracing.rst index 0aa7e2c5d32a..6ab6c0964042 100644 --- a/Documentation/firmware-guide/acpi/method-tracing.rst +++ b/Documentation/firmware-guide/acpi/method-tracing.rst @@ -98,7 +98,7 @@ subject to change:: [ 0.188903] exdebug-0398 ex_trace_point : Method End [0xf58394d8:\_SB.PCI0.LPCB.ECOK] execution. Developers can utilize these special log entries to track the AML -interpretion, thus can aid issue debugging and performance tuning. Note +interpretation, thus can aid issue debugging and performance tuning. Note that, as the "AML tracer" logs are implemented via ACPI_DEBUG_PRINT() macro, CONFIG_ACPI_DEBUG is also required to be enabled for enabling "AML tracer" logs. -- cgit v1.2.3 From 38748bcb940e8b52beee19b0e5cfd740475a99e1 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 10 Nov 2020 09:50:58 -0800 Subject: ACPI: DPTF: Support Alder Lake Add Alder Lake ACPI IDs for DPTF devices. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/acpi/dptf/dptf_pch_fivr.c | 1 + drivers/acpi/dptf/dptf_power.c | 2 ++ drivers/acpi/dptf/int340x_thermal.c | 6 ++++++ drivers/acpi/fan.c | 1 + 4 files changed, 10 insertions(+) diff --git a/drivers/acpi/dptf/dptf_pch_fivr.c b/drivers/acpi/dptf/dptf_pch_fivr.c index 4c1992fce150..5fca18296bf6 100644 --- a/drivers/acpi/dptf/dptf_pch_fivr.c +++ b/drivers/acpi/dptf/dptf_pch_fivr.c @@ -106,6 +106,7 @@ static int pch_fivr_remove(struct platform_device *pdev) static const struct acpi_device_id pch_fivr_device_ids[] = { {"INTC1045", 0}, + {"INTC1049", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, pch_fivr_device_ids); diff --git a/drivers/acpi/dptf/dptf_power.c b/drivers/acpi/dptf/dptf_power.c index 06741305fc77..a24d5d7aa117 100644 --- a/drivers/acpi/dptf/dptf_power.c +++ b/drivers/acpi/dptf/dptf_power.c @@ -229,6 +229,8 @@ static const struct acpi_device_id int3407_device_ids[] = { {"INT3532", 0}, {"INTC1047", 0}, {"INTC1050", 0}, + {"INTC1060", 0}, + {"INTC1061", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, int3407_device_ids); diff --git a/drivers/acpi/dptf/int340x_thermal.c b/drivers/acpi/dptf/int340x_thermal.c index 8d420c7e7178..d14025a85ce8 100644 --- a/drivers/acpi/dptf/int340x_thermal.c +++ b/drivers/acpi/dptf/int340x_thermal.c @@ -25,10 +25,16 @@ static const struct acpi_device_id int340x_thermal_device_ids[] = { {"INT340A"}, {"INT340B"}, {"INTC1040"}, + {"INTC1041"}, {"INTC1043"}, {"INTC1044"}, {"INTC1045"}, + {"INTC1046"}, {"INTC1047"}, + {"INTC1048"}, + {"INTC1049"}, + {"INTC1060"}, + {"INTC1061"}, {""}, }; diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index 62873388b24f..48354f82fba6 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -27,6 +27,7 @@ static const struct acpi_device_id fan_device_ids[] = { {"PNP0C0B", 0}, {"INT3404", 0}, {"INTC1044", 0}, + {"INTC1048", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, fan_device_ids); -- cgit v1.2.3 From 97adb13dc9ba08ecd4758bc59efc0205f5cbf377 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Sat, 7 Nov 2020 13:19:28 +0200 Subject: selftest: fix flower terse dump tests Iproute2 tc classifier terse dump has been accepted with modified syntax. Update the tests accordingly. Signed-off-by: Vlad Buslov Fixes: e7534fd42a99 ("selftests: implement flower classifier terse dump tests") Link: https://lore.kernel.org/r/20201107111928.453534-1-vlad@buslov.dev Signed-off-by: Jakub Kicinski --- tools/testing/selftests/tc-testing/tc-tests/filters/tests.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json index bb543bf69d69..361235ad574b 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json @@ -100,7 +100,7 @@ ], "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop", "expExitCode": "0", - "verifyCmd": "$TC filter show terse dev $DEV2 ingress", + "verifyCmd": "$TC -br filter show dev $DEV2 ingress", "matchPattern": "filter protocol ip pref 1 flower.*handle", "matchCount": "1", "teardown": [ @@ -119,7 +119,7 @@ ], "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop", "expExitCode": "0", - "verifyCmd": "$TC filter show terse dev $DEV2 ingress", + "verifyCmd": "$TC -br filter show dev $DEV2 ingress", "matchPattern": " dst_mac e4:11:22:11:4a:51", "matchCount": "0", "teardown": [ -- cgit v1.2.3 From 3a7001788fed0311d6fb77ed0dabe7bed3567bc0 Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 14 Oct 2020 08:54:09 +0000 Subject: i40e: Fix MAC address setting for a VF via Host/VM Fix MAC setting flow for the PF driver. Update the unicast VF's MAC address in VF structure if it is a new setting in i40e_vc_add_mac_addr_msg. When unicast MAC address gets deleted, record that and set the new unicast MAC address that is already waiting in the filter list. This logic is based on the order of messages arriving to the PF driver. Without this change the MAC address setting was interpreted incorrectly in the following use cases: 1) Print incorrect VF MAC or zero MAC ip link show dev $pf 2) Don't preserve MAC between driver reload rmmod iavf; modprobe iavf 3) Update VF MAC when macvlan was set ip link add link $vf address $mac $vf.1 type macvlan 4) Failed to update mac address when VF was trusted ip link set dev $vf address $mac This includes all other configurations including above commands. Fixes: f657a6e1313b ("i40e: Fix VF driver MAC address configuration") Signed-off-by: Slawomir Laba Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 26 ++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index c96e2f2d4cba..4919d22d7b6b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2713,6 +2713,10 @@ static int i40e_vc_add_mac_addr_msg(struct i40e_vf *vf, u8 *msg) spin_unlock_bh(&vsi->mac_filter_hash_lock); goto error_param; } + if (is_valid_ether_addr(al->list[i].addr) && + is_zero_ether_addr(vf->default_lan_addr.addr)) + ether_addr_copy(vf->default_lan_addr.addr, + al->list[i].addr); } } spin_unlock_bh(&vsi->mac_filter_hash_lock); @@ -2740,6 +2744,7 @@ static int i40e_vc_del_mac_addr_msg(struct i40e_vf *vf, u8 *msg) { struct virtchnl_ether_addr_list *al = (struct virtchnl_ether_addr_list *)msg; + bool was_unimac_deleted = false; struct i40e_pf *pf = vf->pf; struct i40e_vsi *vsi = NULL; i40e_status ret = 0; @@ -2759,6 +2764,8 @@ static int i40e_vc_del_mac_addr_msg(struct i40e_vf *vf, u8 *msg) ret = I40E_ERR_INVALID_MAC_ADDR; goto error_param; } + if (ether_addr_equal(al->list[i].addr, vf->default_lan_addr.addr)) + was_unimac_deleted = true; } vsi = pf->vsi[vf->lan_vsi_idx]; @@ -2779,10 +2786,25 @@ static int i40e_vc_del_mac_addr_msg(struct i40e_vf *vf, u8 *msg) dev_err(&pf->pdev->dev, "Unable to program VF %d MAC filters, error %d\n", vf->vf_id, ret); + if (vf->trusted && was_unimac_deleted) { + struct i40e_mac_filter *f; + struct hlist_node *h; + u8 *macaddr = NULL; + int bkt; + + /* set last unicast mac address as default */ + spin_lock_bh(&vsi->mac_filter_hash_lock); + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { + if (is_valid_ether_addr(f->macaddr)) + macaddr = f->macaddr; + } + if (macaddr) + ether_addr_copy(vf->default_lan_addr.addr, macaddr); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + } error_param: /* send the response to the VF */ - return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_DEL_ETH_ADDR, - ret); + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_DEL_ETH_ADDR, ret); } /** -- cgit v1.2.3 From 1773482fd8cecd5b060d409853f8145be3064a41 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 16 Sep 2020 17:32:28 +0300 Subject: i40e, xsk: uninitialized variable in i40e_clean_rx_irq_zc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "failure" variable is used without being initialized. It should be set to false. Fixes: 8cbf74149903 ("i40e, xsk: move buffer allocation out of the Rx processing loop") Signed-off-by: Dan Carpenter Acked-by: Björn Töpel Tested-by: George Kuruvinakunnel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 6acede0acdca..567fd67e900e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -281,8 +281,8 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) unsigned int total_rx_bytes = 0, total_rx_packets = 0; u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); unsigned int xdp_res, xdp_xmit = 0; + bool failure = false; struct sk_buff *skb; - bool failure; while (likely(total_rx_packets < (unsigned int)budget)) { union i40e_rx_desc *rx_desc; -- cgit v1.2.3 From 6b7ed22ae4c96a415001f0c3116ebee15bb8491a Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Fri, 25 Sep 2020 11:35:37 -0700 Subject: igc: Fix returning wrong statistics 'igc_update_stats()' was not updating 'netdev->stats', so the returned statistics, for example, requested by: $ ip -s link show dev enp3s0 were not being updated and were always zero. Fix by returning a set of statistics that are actually being updated (adapter->stats64). Fixes: c9a11c23ceb6 ("igc: Add netdev") Signed-off-by: Vinicius Costa Gomes Tested-by: Aaron Brown Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 9112dff075cf..b673ac1199bb 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -3891,21 +3891,23 @@ static int igc_change_mtu(struct net_device *netdev, int new_mtu) } /** - * igc_get_stats - Get System Network Statistics + * igc_get_stats64 - Get System Network Statistics * @netdev: network interface device structure + * @stats: rtnl_link_stats64 pointer * * Returns the address of the device statistics structure. * The statistics are updated here and also from the timer callback. */ -static struct net_device_stats *igc_get_stats(struct net_device *netdev) +static void igc_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) { struct igc_adapter *adapter = netdev_priv(netdev); + spin_lock(&adapter->stats64_lock); if (!test_bit(__IGC_RESETTING, &adapter->state)) igc_update_stats(adapter); - - /* only return the current stats */ - return &netdev->stats; + memcpy(stats, &adapter->stats64, sizeof(*stats)); + spin_unlock(&adapter->stats64_lock); } static netdev_features_t igc_fix_features(struct net_device *netdev, @@ -4855,7 +4857,7 @@ static const struct net_device_ops igc_netdev_ops = { .ndo_set_rx_mode = igc_set_rx_mode, .ndo_set_mac_address = igc_set_mac, .ndo_change_mtu = igc_change_mtu, - .ndo_get_stats = igc_get_stats, + .ndo_get_stats64 = igc_get_stats64, .ndo_fix_features = igc_fix_features, .ndo_set_features = igc_set_features, .ndo_features_check = igc_features_check, -- cgit v1.2.3 From 5fb7f75bc138c868df2df40d386c7244122cca77 Mon Sep 17 00:00:00 2001 From: Tony Nguyen Date: Mon, 9 Nov 2020 16:07:35 -0800 Subject: MAINTAINERS: Update repositories for Intel Ethernet Drivers Update Intel Ethernet Drivers repositories to new locations. Signed-off-by: Tony Nguyen --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index cd123d0a6a2d..9e826b55fcd9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8829,8 +8829,8 @@ S: Supported W: http://www.intel.com/support/feedback.htm W: http://e1000.sourceforge.net/ Q: http://patchwork.ozlabs.org/project/intel-wired-lan/list/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-queue.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/net-queue.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue.git F: Documentation/networking/device_drivers/ethernet/intel/ F: drivers/net/ethernet/intel/ F: drivers/net/ethernet/intel/*/ -- cgit v1.2.3 From 866358ec331f8faa394995fb4b511af1db0247c8 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Sun, 8 Nov 2020 09:08:26 -0500 Subject: netlabel: fix our progress tracking in netlbl_unlabel_staticlist() The current NetLabel code doesn't correctly keep track of the netlink dump state in some cases, in particular when multiple interfaces with large configurations are loaded. The problem manifests itself by not reporting the full configuration to userspace, even though it is loaded and active in the kernel. This patch fixes this by ensuring that the dump state is properly reset when necessary inside the netlbl_unlabel_staticlist() function. Fixes: 8cc44579d1bd ("NetLabel: Introduce static network labels for unlabeled connections") Signed-off-by: Paul Moore Link: https://lore.kernel.org/r/160484450633.3752.16512718263560813473.stgit@sifl Signed-off-by: Jakub Kicinski --- net/netlabel/netlabel_unlabeled.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 2e8e3f7b2111..fc55c9116da0 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1166,12 +1166,13 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, struct netlbl_unlhsh_walk_arg cb_arg; u32 skip_bkt = cb->args[0]; u32 skip_chain = cb->args[1]; - u32 iter_bkt; - u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; + u32 skip_addr4 = cb->args[2]; + u32 iter_bkt, iter_chain, iter_addr4 = 0, iter_addr6 = 0; struct netlbl_unlhsh_iface *iface; struct list_head *iter_list; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) + u32 skip_addr6 = cb->args[3]; struct netlbl_af6list *addr6; #endif @@ -1182,7 +1183,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, rcu_read_lock(); for (iter_bkt = skip_bkt; iter_bkt < rcu_dereference(netlbl_unlhsh)->size; - iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { + iter_bkt++) { iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; list_for_each_entry_rcu(iface, iter_list, list) { if (!iface->valid || @@ -1190,7 +1191,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, continue; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { - if (iter_addr4++ < cb->args[2]) + if (iter_addr4++ < skip_addr4) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, @@ -1203,10 +1204,12 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, goto unlabel_staticlist_return; } } + iter_addr4 = 0; + skip_addr4 = 0; #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { - if (iter_addr6++ < cb->args[3]) + if (iter_addr6++ < skip_addr6) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, @@ -1219,8 +1222,12 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, goto unlabel_staticlist_return; } } + iter_addr6 = 0; + skip_addr6 = 0; #endif /* IPv6 */ } + iter_chain = 0; + skip_chain = 0; } unlabel_staticlist_return: -- cgit v1.2.3 From 902a66e08ceaadb9a7a1ab3a4f3af611cd1d8cba Mon Sep 17 00:00:00 2001 From: Sven Van Asbroeck Date: Sun, 8 Nov 2020 12:12:24 -0500 Subject: lan743x: correctly handle chips with internal PHY Commit 6f197fb63850 ("lan743x: Added fixed link and RGMII support") assumes that chips with an internal PHY will never have a devicetree entry. This is incorrect: even for these chips, a devicetree entry can be useful e.g. to pass the mac address from bootloader to chip: &pcie { status = "okay"; host@0 { reg = <0 0 0 0 0>; #address-cells = <3>; #size-cells = <2>; lan7430: ethernet@0 { /* LAN7430 with internal PHY */ compatible = "microchip,lan743x"; status = "okay"; reg = <0 0 0 0 0>; /* filled in by bootloader */ local-mac-address = [00 00 00 00 00 00]; }; }; }; If a devicetree entry is present, the driver will not attach the chip to its internal phy, and the chip will be non-operational. Fix by tweaking the phy connection algorithm: - first try to connect to a phy specified in the devicetree (could be 'real' phy, or just a 'fixed-link') - if that doesn't succeed, try to connect to an internal phy, even if the chip has a devnode Tested on a LAN7430 with internal PHY. I cannot test a device using fixed-link, as I do not have access to one. Fixes: 6f197fb63850 ("lan743x: Added fixed link and RGMII support") Tested-by: Sven Van Asbroeck # lan7430 Reviewed-by: Andrew Lunn Signed-off-by: Sven Van Asbroeck Link: https://lore.kernel.org/r/20201108171224.23829-1-TheSven73@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan743x_main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index a1938842f828..bd77877cf1cc 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1026,9 +1026,9 @@ static int lan743x_phy_open(struct lan743x_adapter *adapter) netdev = adapter->netdev; phynode = of_node_get(adapter->pdev->dev.of_node); - adapter->phy_mode = PHY_INTERFACE_MODE_GMII; if (phynode) { + /* try devicetree phy, or fixed link */ of_get_phy_mode(phynode, &adapter->phy_mode); if (of_phy_is_fixed_link(phynode)) { @@ -1044,13 +1044,15 @@ static int lan743x_phy_open(struct lan743x_adapter *adapter) lan743x_phy_link_status_change, 0, adapter->phy_mode); of_node_put(phynode); - if (!phydev) - goto return_error; - } else { + } + + if (!phydev) { + /* try internal phy */ phydev = phy_find_first(adapter->mdiobus); if (!phydev) goto return_error; + adapter->phy_mode = PHY_INTERFACE_MODE_GMII; ret = phy_connect_direct(netdev, phydev, lan743x_phy_link_status_change, adapter->phy_mode); -- cgit v1.2.3 From f3037c5a31b58a73b32a36e938ad0560085acadd Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 8 Nov 2020 22:44:02 +0100 Subject: net: phy: realtek: support paged operations on RTL8201CP The RTL8401-internal PHY identifies as RTL8201CP, and the init sequence in r8169, copied from vendor driver r8168, uses paged operations. Therefore set the same paged operation callbacks as for the other Realtek PHY's. Fixes: cdafdc29ef75 ("r8169: sync support for RTL8401 with vendor driver") Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/69882f7a-ca2f-e0c7-ae83-c9b6937282cd@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index fb1db713b7fb..575580d3ffe0 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -551,6 +551,8 @@ static struct phy_driver realtek_drvs[] = { { PHY_ID_MATCH_EXACT(0x00008201), .name = "RTL8201CP Ethernet", + .read_page = rtl821x_read_page, + .write_page = rtl821x_write_page, }, { PHY_ID_MATCH_EXACT(0x001cc816), .name = "RTL8201F Fast Ethernet", -- cgit v1.2.3 From 909172a149749242990a6e64cb55d55460d4e417 Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Tue, 10 Nov 2020 08:16:31 +0800 Subject: net: Update window_clamp if SOCK_RCVBUF is set When net.ipv4.tcp_syncookies=1 and syn flood is happened, cookie_v4_check or cookie_v6_check tries to redo what tcp_v4_send_synack or tcp_v6_send_synack did, rsk_window_clamp will be changed if SOCK_RCVBUF is set, which will make rcv_wscale is different, the client still operates with initial window scale and can overshot granted window, the client use the initial scale but local server use new scale to advertise window value, and session work abnormally. Fixes: e88c64f0a425 ("tcp: allow effective reduction of TCP's rcv-buffer via setsockopt") Signed-off-by: Mao Wenan Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/1604967391-123737-1-git-send-email-wenan.mao@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/ipv4/syncookies.c | 9 +++++++-- net/ipv6/syncookies.c | 10 ++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 6ac473b47f30..00dc3f943c80 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -331,7 +331,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) __u32 cookie = ntohl(th->ack_seq) - 1; struct sock *ret = sk; struct request_sock *req; - int mss; + int full_space, mss; struct rtable *rt; __u8 rcv_wscale; struct flowi4 fl4; @@ -427,8 +427,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + /* limit the window selection if the user enforce a smaller rx buffer */ + full_space = tcp_full_space(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; - tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, + tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index e796a64be308..9b6cae1e49d9 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -136,7 +136,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) __u32 cookie = ntohl(th->ack_seq) - 1; struct sock *ret = sk; struct request_sock *req; - int mss; + int full_space, mss; struct dst_entry *dst; __u8 rcv_wscale; u32 tsoff = 0; @@ -241,7 +241,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) } req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); - tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, + /* limit the window selection if the user enforce a smaller rx buffer */ + full_space = tcp_full_space(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; + + tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); -- cgit v1.2.3 From 2bae900b9419db3f3e43bbda3194657235fee096 Mon Sep 17 00:00:00 2001 From: zhangxiaoxu Date: Mon, 9 Nov 2020 09:44:16 -0500 Subject: net: dsa: mv88e6xxx: Fix memleak in mv88e6xxx_region_atu_snapshot When mv88e6xxx_fid_map return error, we lost free the table. Fix it. Fixes: bfb255428966 ("net: dsa: mv88e6xxx: Add devlink regions") Reported-by: Hulk Robot Signed-off-by: zhangxiaoxu Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20201109144416.1540867-1-zhangxiaoxu5@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/devlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/devlink.c b/drivers/net/dsa/mv88e6xxx/devlink.c index 10cd1bfd81a0..ade04c036fd9 100644 --- a/drivers/net/dsa/mv88e6xxx/devlink.c +++ b/drivers/net/dsa/mv88e6xxx/devlink.c @@ -393,8 +393,10 @@ static int mv88e6xxx_region_atu_snapshot(struct devlink *dl, mv88e6xxx_reg_lock(chip); err = mv88e6xxx_fid_map(chip, fid_bitmap); - if (err) + if (err) { + kfree(table); goto out; + } while (1) { fid = find_next_bit(fid_bitmap, MV88E6XXX_N_FID, fid + 1); -- cgit v1.2.3 From 2b52a4b65bc8f14520fe6e996ea7fb3f7e400761 Mon Sep 17 00:00:00 2001 From: Sven Van Asbroeck Date: Mon, 9 Nov 2020 15:38:28 -0500 Subject: lan743x: fix "BUG: invalid wait context" when setting rx mode In the net core, the struct net_device_ops -> ndo_set_rx_mode() callback is called with the dev->addr_list_lock spinlock held. However, this driver's ndo_set_rx_mode callback eventually calls lan743x_dp_write(), which acquires a mutex. Mutex acquisition may sleep, and this is not allowed when holding a spinlock. Fix by removing the dp_lock mutex entirely. Its purpose is to prevent concurrent accesses to the data port. No concurrent accesses are possible, because the dev->addr_list_lock spinlock in the core only lets through one thread at a time. Fixes: 23f0703c125b ("lan743x: Add main source files for new lan743x driver") Signed-off-by: Sven Van Asbroeck Link: https://lore.kernel.org/r/20201109203828.5115-1-TheSven73@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan743x_main.c | 12 +++--------- drivers/net/ethernet/microchip/lan743x_main.h | 3 --- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index bd77877cf1cc..173158656559 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -674,14 +674,12 @@ clean_up: static int lan743x_dp_write(struct lan743x_adapter *adapter, u32 select, u32 addr, u32 length, u32 *buf) { - int ret = -EIO; u32 dp_sel; int i; - mutex_lock(&adapter->dp_lock); if (lan743x_csr_wait_for_bit(adapter, DP_SEL, DP_SEL_DPRDY_, 1, 40, 100, 100)) - goto unlock; + return -EIO; dp_sel = lan743x_csr_read(adapter, DP_SEL); dp_sel &= ~DP_SEL_MASK_; dp_sel |= select; @@ -693,13 +691,10 @@ static int lan743x_dp_write(struct lan743x_adapter *adapter, lan743x_csr_write(adapter, DP_CMD, DP_CMD_WRITE_); if (lan743x_csr_wait_for_bit(adapter, DP_SEL, DP_SEL_DPRDY_, 1, 40, 100, 100)) - goto unlock; + return -EIO; } - ret = 0; -unlock: - mutex_unlock(&adapter->dp_lock); - return ret; + return 0; } static u32 lan743x_mac_mii_access(u16 id, u16 index, int read) @@ -2735,7 +2730,6 @@ static int lan743x_hardware_init(struct lan743x_adapter *adapter, adapter->intr.irq = adapter->pdev->irq; lan743x_csr_write(adapter, INT_EN_CLR, 0xFFFFFFFF); - mutex_init(&adapter->dp_lock); ret = lan743x_gpio_init(adapter); if (ret) diff --git a/drivers/net/ethernet/microchip/lan743x_main.h b/drivers/net/ethernet/microchip/lan743x_main.h index c61a40411317..a536f4a4994d 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.h +++ b/drivers/net/ethernet/microchip/lan743x_main.h @@ -712,9 +712,6 @@ struct lan743x_adapter { struct lan743x_csr csr; struct lan743x_intr intr; - /* lock, used to prevent concurrent access to data port */ - struct mutex dp_lock; - struct lan743x_gpio gpio; struct lan743x_ptp ptp; -- cgit v1.2.3 From 4031eeafa71eaf22ae40a15606a134ae86345daf Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 9 Nov 2020 08:57:05 +0100 Subject: net/af_iucv: fix null pointer dereference on shutdown syzbot reported the following KASAN finding: BUG: KASAN: nullptr-dereference in iucv_send_ctrl+0x390/0x3f0 net/iucv/af_iucv.c:385 Read of size 2 at addr 000000000000021e by task syz-executor907/519 CPU: 0 PID: 519 Comm: syz-executor907 Not tainted 5.9.0-syzkaller-07043-gbcf9877ad213 #0 Hardware name: IBM 3906 M04 701 (KVM/Linux) Call Trace: [<00000000c576af60>] unwind_start arch/s390/include/asm/unwind.h:65 [inline] [<00000000c576af60>] show_stack+0x180/0x228 arch/s390/kernel/dumpstack.c:135 [<00000000c9dcd1f8>] __dump_stack lib/dump_stack.c:77 [inline] [<00000000c9dcd1f8>] dump_stack+0x268/0x2f0 lib/dump_stack.c:118 [<00000000c5fed016>] print_address_description.constprop.0+0x5e/0x218 mm/kasan/report.c:383 [<00000000c5fec82a>] __kasan_report mm/kasan/report.c:517 [inline] [<00000000c5fec82a>] kasan_report+0x11a/0x168 mm/kasan/report.c:534 [<00000000c98b5b60>] iucv_send_ctrl+0x390/0x3f0 net/iucv/af_iucv.c:385 [<00000000c98b6262>] iucv_sock_shutdown+0x44a/0x4c0 net/iucv/af_iucv.c:1457 [<00000000c89d3a54>] __sys_shutdown+0x12c/0x1c8 net/socket.c:2204 [<00000000c89d3b70>] __do_sys_shutdown net/socket.c:2212 [inline] [<00000000c89d3b70>] __s390x_sys_shutdown+0x38/0x48 net/socket.c:2210 [<00000000c9e36eac>] system_call+0xe0/0x28c arch/s390/kernel/entry.S:415 There is nothing to shutdown if a connection has never been established. Besides that iucv->hs_dev is not yet initialized if a socket is in IUCV_OPEN state and iucv->path is not yet initialized if socket is in IUCV_BOUND state. So, just skip the shutdown calls for a socket in these states. Fixes: eac3731bd04c ("[S390]: Add AF_IUCV socket support") Fixes: 82492a355fac ("af_iucv: add shutdown for HS transport") Reviewed-by: Vasily Gorbik Signed-off-by: Ursula Braun [jwi: correct one Fixes tag] Signed-off-by: Julian Wiedmann Signed-off-by: Jakub Kicinski --- net/iucv/af_iucv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index d80572074667..047238f01ba6 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1434,7 +1434,8 @@ static int iucv_sock_shutdown(struct socket *sock, int how) break; } - if (how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) { + if ((how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) && + sk->sk_state == IUCV_CONNECTED) { if (iucv->transport == AF_IUCV_TRANS_IUCV) { txmsg.class = 0; txmsg.tag = 0; -- cgit v1.2.3 From 4711497ae85d90de903671989daf5145054c123e Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 9 Nov 2020 08:57:06 +0100 Subject: MAINTAINERS: remove Ursula Braun as s390 network maintainer I am retiring soon. Thus this patch removes myself from the MAINTAINERS file (s390 network). Signed-off-by: Ursula Braun [jwi: fix up the subject] Signed-off-by: Julian Wiedmann Signed-off-by: Jakub Kicinski --- MAINTAINERS | 3 --- 1 file changed, 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index cd123d0a6a2d..fa11fe773df5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15246,7 +15246,6 @@ F: drivers/iommu/s390-iommu.c S390 IUCV NETWORK LAYER M: Julian Wiedmann M: Karsten Graul -M: Ursula Braun L: linux-s390@vger.kernel.org S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ @@ -15257,7 +15256,6 @@ F: net/iucv/ S390 NETWORK DRIVERS M: Julian Wiedmann M: Karsten Graul -M: Ursula Braun L: linux-s390@vger.kernel.org S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ @@ -15828,7 +15826,6 @@ S: Maintained F: drivers/misc/sgi-xp/ SHARED MEMORY COMMUNICATIONS (SMC) SOCKETS -M: Ursula Braun M: Karsten Graul L: linux-s390@vger.kernel.org S: Supported -- cgit v1.2.3 From fa6882c63621821f73cc806f291208e1c6ea6187 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 9 Nov 2020 22:09:13 +0800 Subject: tipc: fix memory leak in tipc_topsrv_start() kmemleak report a memory leak as follows: unreferenced object 0xffff88810a596800 (size 512): comm "ip", pid 21558, jiffies 4297568990 (age 112.120s) hex dump (first 32 bytes): 00 00 00 00 ad 4e ad de ff ff ff ff 00 00 00 00 .....N.......... ff ff ff ff ff ff ff ff 00 83 60 b0 ff ff ff ff ..........`..... backtrace: [<0000000022bbe21f>] tipc_topsrv_init_net+0x1f3/0xa70 [<00000000fe15ddf7>] ops_init+0xa8/0x3c0 [<00000000138af6f2>] setup_net+0x2de/0x7e0 [<000000008c6807a3>] copy_net_ns+0x27d/0x530 [<000000006b21adbd>] create_new_namespaces+0x382/0xa30 [<00000000bb169746>] unshare_nsproxy_namespaces+0xa1/0x1d0 [<00000000fe2e42bc>] ksys_unshare+0x39c/0x780 [<0000000009ba3b19>] __x64_sys_unshare+0x2d/0x40 [<00000000614ad866>] do_syscall_64+0x56/0xa0 [<00000000a1b5ca3c>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 'srv' is malloced in tipc_topsrv_start() but not free before leaving from the error handling cases. We need to free it. Fixes: 5c45ab24ac77 ("tipc: make struct tipc_server private for server.c") Reported-by: Hulk Robot Signed-off-by: Wang Hai Link: https://lore.kernel.org/r/20201109140913.47370-1-wanghai38@huawei.com Signed-off-by: Jakub Kicinski --- net/tipc/topsrv.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 5f6f86051c83..13f3143609f9 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -664,12 +664,18 @@ static int tipc_topsrv_start(struct net *net) ret = tipc_topsrv_work_start(srv); if (ret < 0) - return ret; + goto err_start; ret = tipc_topsrv_create_listener(srv); if (ret < 0) - tipc_topsrv_work_stop(srv); + goto err_create; + return 0; + +err_create: + tipc_topsrv_work_stop(srv); +err_start: + kfree(srv); return ret; } -- cgit v1.2.3 From df392aefe96b9f94efb01ef298b617bab346a9be Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 9 Nov 2020 12:04:36 +0100 Subject: arm64: dts: fsl-ls1028a-kontron-sl28: specify in-band mode for ENETC Since commit 71b77a7a27a3 ("enetc: Migrate to PHYLINK and PCS_LYNX") the network port of the Kontron sl28 board is broken. After the migration to phylink the device tree has to specify the in-band-mode property. Add it. Fixes: 71b77a7a27a3 ("enetc: Migrate to PHYLINK and PCS_LYNX") Suggested-by: Vladimir Oltean Signed-off-by: Michael Walle Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20201109110436.5906-1-michael@walle.cc Signed-off-by: Jakub Kicinski --- arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts index f46eb47cfa4d..8161dd237971 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts @@ -75,6 +75,7 @@ &enetc_port0 { phy-handle = <&phy0>; phy-connection-type = "sgmii"; + managed = "in-band-status"; status = "okay"; mdio { -- cgit v1.2.3 From 361182308766a265b6c521879b34302617a8c209 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Mon, 9 Nov 2020 07:54:49 +0100 Subject: net/x25: Fix null-ptr-deref in x25_connect This fixes a regression for blocking connects introduced by commit 4becb7ee5b3d ("net/x25: Fix x25_neigh refcnt leak when x25 disconnect"). The x25->neighbour is already set to "NULL" by x25_disconnect() now, while a blocking connect is waiting in x25_wait_for_connection_establishment(). Therefore x25->neighbour must not be accessed here again and x25->state is also already set to X25_STATE_0 by x25_disconnect(). Fixes: 4becb7ee5b3d ("net/x25: Fix x25_neigh refcnt leak when x25 disconnect") Signed-off-by: Martin Schiller Reviewed-by: Xie He Link: https://lore.kernel.org/r/20201109065449.9014-1-ms@dev.tdt.de Signed-off-by: Jakub Kicinski --- net/x25/af_x25.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 0bbb283f23c9..046d3fee66a9 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -825,7 +825,7 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTED; rc = 0; out_put_neigh: - if (rc) { + if (rc && x25->neighbour) { read_lock_bh(&x25_list_lock); x25_neigh_put(x25->neighbour); x25->neighbour = NULL; -- cgit v1.2.3 From 9d2e5e9eeb59524a59b461fe256139826d464e1e Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:31 +0530 Subject: cxgb4/ch_ktls: decrypted bit is not enough If skb has retransmit data starting before start marker, e.g. ccs, decrypted bit won't be set for that, and if it has some data to encrypt, then it must be given to crypto ULD. So in place of decrypted, check if socket is tls offloaded. Also, unless skb has some data to encrypt, no need to give it for tls offload handling. v2->v3: - Removed ifdef. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 1 + drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h | 5 +++++ drivers/net/ethernet/chelsio/cxgb4/sge.c | 3 ++- drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 4 ---- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index a952fe198eb9..7fd264a6d085 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -1176,6 +1176,7 @@ static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb, txq = netdev_pick_tx(dev, skb, sb_dev); if (xfrm_offload(skb) || is_ptp_enabled(skb, dev) || skb->encapsulation || + cxgb4_is_ktls_skb(skb) || (proto != IPPROTO_TCP && proto != IPPROTO_UDP)) txq = txq % pi->nqsets; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h index b169776ab484..e2a4941fa802 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -493,6 +493,11 @@ struct cxgb4_uld_info { #endif }; +static inline bool cxgb4_is_ktls_skb(struct sk_buff *skb) +{ + return skb->sk && tls_is_sk_tx_device_offloaded(skb->sk); +} + void cxgb4_uld_enable(struct adapter *adap); void cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p); int cxgb4_unregister_uld(enum cxgb4_uld type); diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index a9e9c7ae565d..01bd9c0dfe4e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -1422,7 +1422,8 @@ static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev) #endif /* CHELSIO_IPSEC_INLINE */ #if IS_ENABLED(CONFIG_CHELSIO_TLS_DEVICE) - if (skb->decrypted) + if (cxgb4_is_ktls_skb(skb) && + (skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb)))) return adap->uld[CXGB4_ULD_KTLS].tx_handler(skb, dev); #endif /* CHELSIO_TLS_DEVICE */ diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 5195f692f14d..43c723c72c61 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1878,10 +1878,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) mss = skb_is_gso(skb) ? skb_shinfo(skb)->gso_size : skb->data_len; - /* check if we haven't set it for ktls offload */ - if (!skb->sk || !tls_is_sk_tx_device_offloaded(skb->sk)) - goto out; - tls_ctx = tls_get_ctx(skb->sk); if (unlikely(tls_ctx->netdev != dev)) goto out; -- cgit v1.2.3 From b1b5cb18032b37ab69b23a461eb8be1a44fcfc3b Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:32 +0530 Subject: ch_ktls: Correction in finding correct length There is a possibility of linear skbs coming in. Correcting the length extraction logic. v2->v3: - Separated un-related changes from this patch. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 43c723c72c61..447aec7ae954 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -967,7 +967,7 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, /* packet length = eth hdr len + ip hdr len + tcp hdr len * (including options). */ - pktlen = skb->len - skb->data_len; + pktlen = skb_transport_offset(skb) + tcp_hdrlen(skb); ctrl = sizeof(*cpl) + pktlen; len16 = DIV_ROUND_UP(sizeof(*wr) + ctrl, 16); @@ -1860,6 +1860,7 @@ out: /* nic tls TX handler */ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) { + u32 tls_end_offset, tcp_seq, skb_data_len, skb_offset; struct ch_ktls_port_stats_debug *port_stats; struct chcr_ktls_ofld_ctx_tx *tx_ctx; struct ch_ktls_stats_debug *stats; @@ -1867,7 +1868,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) int data_len, qidx, ret = 0, mss; struct tls_record_info *record; struct chcr_ktls_info *tx_info; - u32 tls_end_offset, tcp_seq; struct tls_context *tls_ctx; struct sk_buff *local_skb; struct sge_eth_txq *q; @@ -1875,8 +1875,11 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) unsigned long flags; tcp_seq = ntohl(th->seq); + skb_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); + skb_data_len = skb->len - skb_offset; + data_len = skb_data_len; - mss = skb_is_gso(skb) ? skb_shinfo(skb)->gso_size : skb->data_len; + mss = skb_is_gso(skb) ? skb_shinfo(skb)->gso_size : data_len; tls_ctx = tls_get_ctx(skb->sk); if (unlikely(tls_ctx->netdev != dev)) @@ -1922,8 +1925,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) /* copy skb contents into local skb */ chcr_ktls_skb_copy(skb, local_skb); - /* go through the skb and send only one record at a time. */ - data_len = skb->data_len; /* TCP segments can be in received either complete or partial. * chcr_end_part_handler will handle cases if complete record or end * part of the record is received. Incase of partial end part of record, @@ -2020,9 +2021,9 @@ clear_ref: } while (data_len > 0); - tx_info->prev_seq = ntohl(th->seq) + skb->data_len; + tx_info->prev_seq = ntohl(th->seq) + skb_data_len; atomic64_inc(&port_stats->ktls_tx_encrypted_packets); - atomic64_add(skb->data_len, &port_stats->ktls_tx_encrypted_bytes); + atomic64_add(skb_data_len, &port_stats->ktls_tx_encrypted_bytes); /* tcp finish is set, send a separate tcp msg including all the options * as well. -- cgit v1.2.3 From 86716b51d14fc2201938939b323ba3ad99186910 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:33 +0530 Subject: ch_ktls: Update cheksum information Checksum update was missing in the WR. Fixes: 429765a149f1 ("chcr: handle partial end part of a record") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 447aec7ae954..b7a3e757ee72 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -959,6 +959,7 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, struct iphdr *ip; int credits; u8 buf[150]; + u64 cntrl1; void *pos; iplen = skb_network_header_len(skb); @@ -997,22 +998,28 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, TXPKT_PF_V(tx_info->adap->pf)); cpl->pack = 0; cpl->len = htons(pktlen); - /* checksum offload */ - cpl->ctrl1 = 0; - - pos = cpl + 1; memcpy(buf, skb->data, pktlen); if (tx_info->ip_family == AF_INET) { /* we need to correct ip header len */ ip = (struct iphdr *)(buf + maclen); ip->tot_len = htons(pktlen - maclen); + cntrl1 = TXPKT_CSUM_TYPE_V(TX_CSUM_TCPIP); #if IS_ENABLED(CONFIG_IPV6) } else { ip6 = (struct ipv6hdr *)(buf + maclen); ip6->payload_len = htons(pktlen - maclen - iplen); + cntrl1 = TXPKT_CSUM_TYPE_V(TX_CSUM_TCPIP6); #endif } + + cntrl1 |= T6_TXPKT_ETHHDR_LEN_V(maclen - ETH_HLEN) | + TXPKT_IPHDR_LEN_V(iplen); + /* checksum offload */ + cpl->ctrl1 = cpu_to_be64(cntrl1); + + pos = cpl + 1; + /* now take care of the tcp header, if fin is not set then clear push * bit as well, and if fin is set, it will be sent at the last so we * need to update the tcp sequence number as per the last packet. -- cgit v1.2.3 From 687823d2d104df8226eacba74fda9f4ba3aecd6c Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:34 +0530 Subject: cxgb4/ch_ktls: creating skbs causes panic Creating SKB per tls record and freeing the original one causes panic. There will be race if connection reset is requested. By freeing original skb, refcnt will be decremented and that means, there is no pending record to send, and so tls_dev_del will be requested in control path while SKB of related connection is in queue. Better approach is to use same SKB to send one record (partial data) at a time. We still have to create a new SKB when partial last part of a record is requested. This fix introduces new API cxgb4_write_partial_sgl() to send partial part of skb. Present cxgb4_write_sgl can only provide feasibility to start from an offset which limits to header only and it can write sgls for the whole skb len. But this new API will help in both. It can start from any offset and can end writing in middle of the skb. v4->v5: - Removed extra changes. Fixes: 429765a149f1 ("chcr: handle partial end part of a record") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 3 + drivers/net/ethernet/chelsio/cxgb4/sge.c | 108 ++++++++ .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 284 +++++++++------------ 3 files changed, 226 insertions(+), 169 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index 3352dad6ca99..27308600da15 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -2124,6 +2124,9 @@ void cxgb4_inline_tx_skb(const struct sk_buff *skb, const struct sge_txq *q, void cxgb4_write_sgl(const struct sk_buff *skb, struct sge_txq *q, struct ulptx_sgl *sgl, u64 *end, unsigned int start, const dma_addr_t *addr); +void cxgb4_write_partial_sgl(const struct sk_buff *skb, struct sge_txq *q, + struct ulptx_sgl *sgl, u64 *end, + const dma_addr_t *addr, u32 start, u32 send_len); void cxgb4_ring_tx_db(struct adapter *adap, struct sge_txq *q, int n); int t4_set_vlan_acl(struct adapter *adap, unsigned int mbox, unsigned int vf, u16 vlan); diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 01bd9c0dfe4e..196652a114c5 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -890,6 +890,114 @@ void cxgb4_write_sgl(const struct sk_buff *skb, struct sge_txq *q, } EXPORT_SYMBOL(cxgb4_write_sgl); +/* cxgb4_write_partial_sgl - populate SGL for partial packet + * @skb: the packet + * @q: the Tx queue we are writing into + * @sgl: starting location for writing the SGL + * @end: points right after the end of the SGL + * @addr: the list of bus addresses for the SGL elements + * @start: start offset in the SKB where partial data starts + * @len: length of data from @start to send out + * + * This API will handle sending out partial data of a skb if required. + * Unlike cxgb4_write_sgl, @start can be any offset into the skb data, + * and @len will decide how much data after @start offset to send out. + */ +void cxgb4_write_partial_sgl(const struct sk_buff *skb, struct sge_txq *q, + struct ulptx_sgl *sgl, u64 *end, + const dma_addr_t *addr, u32 start, u32 len) +{ + struct ulptx_sge_pair buf[MAX_SKB_FRAGS / 2 + 1] = {0}, *to; + u32 frag_size, skb_linear_data_len = skb_headlen(skb); + struct skb_shared_info *si = skb_shinfo(skb); + u8 i = 0, frag_idx = 0, nfrags = 0; + skb_frag_t *frag; + + /* Fill the first SGL either from linear data or from partial + * frag based on @start. + */ + if (unlikely(start < skb_linear_data_len)) { + frag_size = min(len, skb_linear_data_len - start); + sgl->len0 = htonl(frag_size); + sgl->addr0 = cpu_to_be64(addr[0] + start); + len -= frag_size; + nfrags++; + } else { + start -= skb_linear_data_len; + frag = &si->frags[frag_idx]; + frag_size = skb_frag_size(frag); + /* find the first frag */ + while (start >= frag_size) { + start -= frag_size; + frag_idx++; + frag = &si->frags[frag_idx]; + frag_size = skb_frag_size(frag); + } + + frag_size = min(len, skb_frag_size(frag) - start); + sgl->len0 = cpu_to_be32(frag_size); + sgl->addr0 = cpu_to_be64(addr[frag_idx + 1] + start); + len -= frag_size; + nfrags++; + frag_idx++; + } + + /* If the entire partial data fit in one SGL, then send it out + * now. + */ + if (!len) + goto done; + + /* Most of the complexity below deals with the possibility we hit the + * end of the queue in the middle of writing the SGL. For this case + * only we create the SGL in a temporary buffer and then copy it. + */ + to = (u8 *)end > (u8 *)q->stat ? buf : sgl->sge; + + /* If the skb couldn't fit in first SGL completely, fill the + * rest of the frags in subsequent SGLs. Note that each SGL + * pair can store 2 frags. + */ + while (len) { + frag_size = min(len, skb_frag_size(&si->frags[frag_idx])); + to->len[i & 1] = cpu_to_be32(frag_size); + to->addr[i & 1] = cpu_to_be64(addr[frag_idx + 1]); + if (i && (i & 1)) + to++; + nfrags++; + frag_idx++; + i++; + len -= frag_size; + } + + /* If we ended in an odd boundary, then set the second SGL's + * length in the pair to 0. + */ + if (i & 1) + to->len[1] = cpu_to_be32(0); + + /* Copy from temporary buffer to Tx ring, in case we hit the + * end of the queue in the middle of writing the SGL. + */ + if (unlikely((u8 *)end > (u8 *)q->stat)) { + u32 part0 = (u8 *)q->stat - (u8 *)sgl->sge, part1; + + if (likely(part0)) + memcpy(sgl->sge, buf, part0); + part1 = (u8 *)end - (u8 *)q->stat; + memcpy(q->desc, (u8 *)buf + part0, part1); + end = (void *)q->desc + part1; + } + + /* 0-pad to multiple of 16 */ + if ((uintptr_t)end & 8) + *end = 0; +done: + sgl->cmd_nsge = htonl(ULPTX_CMD_V(ULP_TX_SC_DSGL) | + ULPTX_NSGE_V(nfrags)); +} +EXPORT_SYMBOL(cxgb4_write_partial_sgl); + /* This function copies 64 byte coalesced work request to * memory mapped BAR2 space. For coalesced WR SGE fetches * data from the FIFO instead of from Host. diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index b7a3e757ee72..950841988ffe 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -14,6 +14,50 @@ static LIST_HEAD(uld_ctx_list); static DEFINE_MUTEX(dev_mutex); +/* chcr_get_nfrags_to_send: get the remaining nfrags after start offset + * @skb: skb + * @start: start offset. + * @len: how much data to send after @start + */ +static int chcr_get_nfrags_to_send(struct sk_buff *skb, u32 start, u32 len) +{ + struct skb_shared_info *si = skb_shinfo(skb); + u32 frag_size, skb_linear_data_len = skb_headlen(skb); + u8 nfrags = 0, frag_idx = 0; + skb_frag_t *frag; + + /* if its a linear skb then return 1 */ + if (!skb_is_nonlinear(skb)) + return 1; + + if (unlikely(start < skb_linear_data_len)) { + frag_size = min(len, skb_linear_data_len - start); + start = 0; + } else { + start -= skb_linear_data_len; + + frag = &si->frags[frag_idx]; + frag_size = skb_frag_size(frag); + while (start >= frag_size) { + start -= frag_size; + frag_idx++; + frag = &si->frags[frag_idx]; + frag_size = skb_frag_size(frag); + } + frag_size = min(len, skb_frag_size(frag) - start); + } + len -= frag_size; + nfrags++; + + while (len) { + frag_size = min(len, skb_frag_size(&si->frags[frag_idx])); + len -= frag_size; + nfrags++; + frag_idx++; + } + return nfrags; +} + static int chcr_init_tcb_fields(struct chcr_ktls_info *tx_info); /* * chcr_ktls_save_keys: calculate and save crypto keys. @@ -865,35 +909,15 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, return 0; } -/* - * chcr_ktls_skb_copy - * @nskb - new skb where the frags to be added. - * @skb - old skb from which frags will be copied. - */ -static void chcr_ktls_skb_copy(struct sk_buff *skb, struct sk_buff *nskb) -{ - int i; - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - skb_shinfo(nskb)->frags[i] = skb_shinfo(skb)->frags[i]; - __skb_frag_ref(&skb_shinfo(nskb)->frags[i]); - } - - skb_shinfo(nskb)->nr_frags = skb_shinfo(skb)->nr_frags; - nskb->len += skb->data_len; - nskb->data_len = skb->data_len; - nskb->truesize += skb->data_len; -} - /* * chcr_ktls_get_tx_flits * returns number of flits to be sent out, it includes key context length, WR * size and skb fragments. */ static unsigned int -chcr_ktls_get_tx_flits(const struct sk_buff *skb, unsigned int key_ctx_len) +chcr_ktls_get_tx_flits(u32 nr_frags, unsigned int key_ctx_len) { - return chcr_sgl_len(skb_shinfo(skb)->nr_frags) + + return chcr_sgl_len(nr_frags) + DIV_ROUND_UP(key_ctx_len + CHCR_KTLS_WR_SIZE, 8); } @@ -1038,71 +1062,6 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, return 0; } -/* chcr_ktls_skb_shift - Shifts request length paged data from skb to another. - * @tgt- buffer into which tail data gets added - * @skb- buffer from which the paged data comes from - * @shiftlen- shift up to this many bytes - */ -static int chcr_ktls_skb_shift(struct sk_buff *tgt, struct sk_buff *skb, - int shiftlen) -{ - skb_frag_t *fragfrom, *fragto; - int from, to, todo; - - WARN_ON(shiftlen > skb->data_len); - - todo = shiftlen; - from = 0; - to = 0; - fragfrom = &skb_shinfo(skb)->frags[from]; - - while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { - fragfrom = &skb_shinfo(skb)->frags[from]; - fragto = &skb_shinfo(tgt)->frags[to]; - - if (todo >= skb_frag_size(fragfrom)) { - *fragto = *fragfrom; - todo -= skb_frag_size(fragfrom); - from++; - to++; - - } else { - __skb_frag_ref(fragfrom); - skb_frag_page_copy(fragto, fragfrom); - skb_frag_off_copy(fragto, fragfrom); - skb_frag_size_set(fragto, todo); - - skb_frag_off_add(fragfrom, todo); - skb_frag_size_sub(fragfrom, todo); - todo = 0; - - to++; - break; - } - } - - /* Ready to "commit" this state change to tgt */ - skb_shinfo(tgt)->nr_frags = to; - - /* Reposition in the original skb */ - to = 0; - while (from < skb_shinfo(skb)->nr_frags) - skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; - - skb_shinfo(skb)->nr_frags = to; - - WARN_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); - - skb->len -= shiftlen; - skb->data_len -= shiftlen; - skb->truesize -= shiftlen; - tgt->len += shiftlen; - tgt->data_len += shiftlen; - tgt->truesize += shiftlen; - - return shiftlen; -} - /* * chcr_ktls_xmit_wr_complete: This sends out the complete record. If an skb * received has partial end part of the record, send out the complete record, so @@ -1118,6 +1077,8 @@ static int chcr_ktls_skb_shift(struct sk_buff *tgt, struct sk_buff *skb, static int chcr_ktls_xmit_wr_complete(struct sk_buff *skb, struct chcr_ktls_info *tx_info, struct sge_eth_txq *q, u32 tcp_seq, + bool is_last_wr, u32 data_len, + u32 skb_offset, u32 nfrags, bool tcp_push, u32 mss) { u32 len16, wr_mid = 0, flits = 0, ndesc, cipher_start; @@ -1133,7 +1094,7 @@ static int chcr_ktls_xmit_wr_complete(struct sk_buff *skb, u64 *end; /* get the number of flits required */ - flits = chcr_ktls_get_tx_flits(skb, tx_info->key_ctx_len); + flits = chcr_ktls_get_tx_flits(nfrags, tx_info->key_ctx_len); /* number of descriptors */ ndesc = chcr_flits_to_desc(flits); /* check if enough credits available */ @@ -1162,6 +1123,9 @@ static int chcr_ktls_xmit_wr_complete(struct sk_buff *skb, return NETDEV_TX_BUSY; } + if (!is_last_wr) + skb_get(skb); + pos = &q->q.desc[q->q.pidx]; end = (u64 *)pos + flits; /* FW_ULPTX_WR */ @@ -1194,7 +1158,7 @@ static int chcr_ktls_xmit_wr_complete(struct sk_buff *skb, CPL_TX_SEC_PDU_CPLLEN_V(CHCR_CPL_TX_SEC_PDU_LEN_64BIT) | CPL_TX_SEC_PDU_PLACEHOLDER_V(1) | CPL_TX_SEC_PDU_IVINSRTOFST_V(TLS_HEADER_SIZE + 1)); - cpl->pldlen = htonl(skb->data_len); + cpl->pldlen = htonl(data_len); /* encryption should start after tls header size + iv size */ cipher_start = TLS_HEADER_SIZE + tx_info->iv_size + 1; @@ -1236,7 +1200,7 @@ static int chcr_ktls_xmit_wr_complete(struct sk_buff *skb, /* CPL_TX_DATA */ tx_data = (void *)pos; OPCODE_TID(tx_data) = htonl(MK_OPCODE_TID(CPL_TX_DATA, tx_info->tid)); - tx_data->len = htonl(TX_DATA_MSS_V(mss) | TX_LENGTH_V(skb->data_len)); + tx_data->len = htonl(TX_DATA_MSS_V(mss) | TX_LENGTH_V(data_len)); tx_data->rsvd = htonl(tcp_seq); @@ -1256,8 +1220,8 @@ static int chcr_ktls_xmit_wr_complete(struct sk_buff *skb, } /* send the complete packet except the header */ - cxgb4_write_sgl(skb, &q->q, pos, end, skb->len - skb->data_len, - sgl_sdesc->addr); + cxgb4_write_partial_sgl(skb, &q->q, pos, end, sgl_sdesc->addr, + skb_offset, data_len); sgl_sdesc->skb = skb; chcr_txq_advance(&q->q, ndesc); @@ -1289,10 +1253,11 @@ static int chcr_ktls_xmit_wr_short(struct sk_buff *skb, struct sge_eth_txq *q, u32 tcp_seq, bool tcp_push, u32 mss, u32 tls_rec_offset, u8 *prior_data, - u32 prior_data_len) + u32 prior_data_len, u32 data_len, + u32 skb_offset) { + u32 len16, wr_mid = 0, cipher_start, nfrags; struct adapter *adap = tx_info->adap; - u32 len16, wr_mid = 0, cipher_start; unsigned int flits = 0, ndesc; int credits, left, last_desc; struct tx_sw_desc *sgl_sdesc; @@ -1305,10 +1270,11 @@ static int chcr_ktls_xmit_wr_short(struct sk_buff *skb, void *pos; u64 *end; + nfrags = chcr_get_nfrags_to_send(skb, skb_offset, data_len); /* get the number of flits required, it's a partial record so 2 flits * (AES_BLOCK_SIZE) will be added. */ - flits = chcr_ktls_get_tx_flits(skb, tx_info->key_ctx_len) + 2; + flits = chcr_ktls_get_tx_flits(nfrags, tx_info->key_ctx_len) + 2; /* get the correct 8 byte IV of this record */ iv_record = cpu_to_be64(tx_info->iv + tx_info->record_no); /* If it's a middle record and not 16 byte aligned to run AES CTR, need @@ -1380,7 +1346,7 @@ static int chcr_ktls_xmit_wr_short(struct sk_buff *skb, htonl(CPL_TX_SEC_PDU_OPCODE_V(CPL_TX_SEC_PDU) | CPL_TX_SEC_PDU_CPLLEN_V(CHCR_CPL_TX_SEC_PDU_LEN_64BIT) | CPL_TX_SEC_PDU_IVINSRTOFST_V(1)); - cpl->pldlen = htonl(skb->data_len + AES_BLOCK_LEN + prior_data_len); + cpl->pldlen = htonl(data_len + AES_BLOCK_LEN + prior_data_len); cpl->aadstart_cipherstop_hi = htonl(CPL_TX_SEC_PDU_CIPHERSTART_V(cipher_start)); cpl->cipherstop_lo_authinsert = 0; @@ -1411,7 +1377,7 @@ static int chcr_ktls_xmit_wr_short(struct sk_buff *skb, tx_data = (void *)pos; OPCODE_TID(tx_data) = htonl(MK_OPCODE_TID(CPL_TX_DATA, tx_info->tid)); tx_data->len = htonl(TX_DATA_MSS_V(mss) | - TX_LENGTH_V(skb->data_len + prior_data_len)); + TX_LENGTH_V(data_len + prior_data_len)); tx_data->rsvd = htonl(tcp_seq); tx_data->flags = htonl(TX_BYPASS_F); if (tcp_push) @@ -1444,8 +1410,8 @@ static int chcr_ktls_xmit_wr_short(struct sk_buff *skb, if (prior_data_len) pos = chcr_copy_to_txd(prior_data, &q->q, pos, 16); /* send the complete packet except the header */ - cxgb4_write_sgl(skb, &q->q, pos, end, skb->len - skb->data_len, - sgl_sdesc->addr); + cxgb4_write_partial_sgl(skb, &q->q, pos, end, sgl_sdesc->addr, + skb_offset, data_len); sgl_sdesc->skb = skb; chcr_txq_advance(&q->q, ndesc); @@ -1473,6 +1439,7 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, struct sk_buff *skb, u32 tcp_seq, u32 mss, bool tcp_push, struct sge_eth_txq *q, u32 port_id, u8 *prior_data, + u32 data_len, u32 skb_offset, u32 prior_data_len) { int credits, left, len16, last_desc; @@ -1482,14 +1449,16 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, struct ulptx_idata *idata; struct ulp_txpkt *ulptx; struct fw_ulptx_wr *wr; - u32 wr_mid = 0; + u32 wr_mid = 0, nfrags; void *pos; u64 *end; flits = DIV_ROUND_UP(CHCR_PLAIN_TX_DATA_LEN, 8); - flits += chcr_sgl_len(skb_shinfo(skb)->nr_frags); + nfrags = chcr_get_nfrags_to_send(skb, skb_offset, data_len); + flits += chcr_sgl_len(nfrags); if (prior_data_len) flits += 2; + /* WR will need len16 */ len16 = DIV_ROUND_UP(flits, 2); /* check how many descriptors needed */ @@ -1542,7 +1511,7 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, tx_data = (struct cpl_tx_data *)(idata + 1); OPCODE_TID(tx_data) = htonl(MK_OPCODE_TID(CPL_TX_DATA, tx_info->tid)); tx_data->len = htonl(TX_DATA_MSS_V(mss) | - TX_LENGTH_V(skb->data_len + prior_data_len)); + TX_LENGTH_V(data_len + prior_data_len)); /* set tcp seq number */ tx_data->rsvd = htonl(tcp_seq); tx_data->flags = htonl(TX_BYPASS_F); @@ -1566,8 +1535,8 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, end = pos + left; } /* send the complete packet including the header */ - cxgb4_write_sgl(skb, &q->q, pos, end, skb->len - skb->data_len, - sgl_sdesc->addr); + cxgb4_write_partial_sgl(skb, &q->q, pos, end, sgl_sdesc->addr, + skb_offset, data_len); sgl_sdesc->skb = skb; chcr_txq_advance(&q->q, ndesc); @@ -1578,9 +1547,11 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, /* * chcr_ktls_copy_record_in_skb * @nskb - new skb where the frags to be added. + * @skb - old skb, to copy socket and destructor details. * @record - specific record which has complete 16k record in frags. */ static void chcr_ktls_copy_record_in_skb(struct sk_buff *nskb, + struct sk_buff *skb, struct tls_record_info *record) { int i = 0; @@ -1595,6 +1566,9 @@ static void chcr_ktls_copy_record_in_skb(struct sk_buff *nskb, nskb->data_len = record->len; nskb->len += record->len; nskb->truesize += record->len; + nskb->sk = skb->sk; + nskb->destructor = skb->destructor; + refcount_add(nskb->truesize, &nskb->sk->sk_wmem_alloc); } /* @@ -1666,7 +1640,7 @@ static int chcr_end_part_handler(struct chcr_ktls_info *tx_info, struct sk_buff *skb, struct tls_record_info *record, u32 tcp_seq, int mss, bool tcp_push_no_fin, - struct sge_eth_txq *q, + struct sge_eth_txq *q, u32 skb_offset, u32 tls_end_offset, bool last_wr) { struct sk_buff *nskb = NULL; @@ -1675,13 +1649,14 @@ static int chcr_end_part_handler(struct chcr_ktls_info *tx_info, nskb = skb; atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_complete_pkts); } else { - dev_kfree_skb_any(skb); - - nskb = alloc_skb(0, GFP_KERNEL); - if (!nskb) + nskb = alloc_skb(0, GFP_ATOMIC); + if (!nskb) { + dev_kfree_skb_any(skb); return NETDEV_TX_BUSY; + } + /* copy complete record in skb */ - chcr_ktls_copy_record_in_skb(nskb, record); + chcr_ktls_copy_record_in_skb(nskb, skb, record); /* packet is being sent from the beginning, update the tcp_seq * accordingly. */ @@ -1691,10 +1666,20 @@ static int chcr_end_part_handler(struct chcr_ktls_info *tx_info, */ if (chcr_ktls_update_snd_una(tx_info, q)) goto out; + /* reset skb offset */ + skb_offset = 0; + + if (last_wr) + dev_kfree_skb_any(skb); + + last_wr = true; + atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_end_pkts); } if (chcr_ktls_xmit_wr_complete(nskb, tx_info, q, tcp_seq, + last_wr, record->len, skb_offset, + record->num_frags, (last_wr && tcp_push_no_fin), mss)) { goto out; @@ -1730,41 +1715,32 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, struct sk_buff *skb, struct tls_record_info *record, u32 tcp_seq, int mss, bool tcp_push_no_fin, + u32 data_len, u32 skb_offset, struct sge_eth_txq *q, u32 tls_end_offset) { u32 tls_rec_offset = tcp_seq - tls_record_start_seq(record); u8 prior_data[16] = {0}; u32 prior_data_len = 0; - u32 data_len; /* check if the skb is ending in middle of tag/HASH, its a big * trouble, send the packet before the HASH. */ - int remaining_record = tls_end_offset - skb->data_len; + int remaining_record = tls_end_offset - data_len; if (remaining_record > 0 && remaining_record < TLS_CIPHER_AES_GCM_128_TAG_SIZE) { - int trimmed_len = skb->data_len - + int trimmed_len = data_len - (TLS_CIPHER_AES_GCM_128_TAG_SIZE - remaining_record); - struct sk_buff *tmp_skb = NULL; /* don't process the pkt if it is only a partial tag */ - if (skb->data_len < TLS_CIPHER_AES_GCM_128_TAG_SIZE) + if (data_len < TLS_CIPHER_AES_GCM_128_TAG_SIZE) goto out; - WARN_ON(trimmed_len > skb->data_len); - - /* shift to those many bytes */ - tmp_skb = alloc_skb(0, GFP_KERNEL); - if (unlikely(!tmp_skb)) - goto out; + WARN_ON(trimmed_len > data_len); - chcr_ktls_skb_shift(tmp_skb, skb, trimmed_len); - /* free the last trimmed portion */ - dev_kfree_skb_any(skb); - skb = tmp_skb; + data_len = trimmed_len; atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_trimmed_pkts); } - data_len = skb->data_len; + /* check if the middle record's start point is 16 byte aligned. CTR * needs 16 byte aligned start point to start encryption. */ @@ -1825,9 +1801,6 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, } /* reset tcp_seq as per the prior_data_required len */ tcp_seq -= prior_data_len; - /* include prio_data_len for further calculation. - */ - data_len += prior_data_len; } /* reset snd una, so the middle record won't send the already * sent part. @@ -1844,6 +1817,7 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, tcp_push_no_fin, q, tx_info->port_id, prior_data, + data_len, skb_offset, prior_data_len)) { goto out; } @@ -1854,7 +1828,7 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, if (chcr_ktls_xmit_wr_short(skb, tx_info, q, tcp_seq, tcp_push_no_fin, mss, tls_rec_offset, prior_data, - prior_data_len)) { + prior_data_len, data_len, skb_offset)) { goto out; } @@ -1876,7 +1850,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) struct tls_record_info *record; struct chcr_ktls_info *tx_info; struct tls_context *tls_ctx; - struct sk_buff *local_skb; struct sge_eth_txq *q; struct adapter *adap; unsigned long flags; @@ -1898,14 +1871,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(!tx_info)) goto out; - /* don't touch the original skb, make a new skb to extract each records - * and send them separately. - */ - local_skb = alloc_skb(0, GFP_KERNEL); - - if (unlikely(!local_skb)) - return NETDEV_TX_BUSY; - adap = tx_info->adap; stats = &adap->ch_ktls_stats; port_stats = &stats->ktls_port[tx_info->port_id]; @@ -1925,13 +1890,9 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) ntohl(th->ack_seq), ntohs(th->window)); if (ret) { - dev_kfree_skb_any(local_skb); return NETDEV_TX_BUSY; } - /* copy skb contents into local skb */ - chcr_ktls_skb_copy(skb, local_skb); - /* TCP segments can be in received either complete or partial. * chcr_end_part_handler will handle cases if complete record or end * part of the record is received. Incase of partial end part of record, @@ -1961,7 +1922,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) atomic64_inc(&port_stats->ktls_tx_skip_no_sync_data); goto out; } - /* increase page reference count of the record, so that there * won't be any chance of page free in middle if in case stack * receives ACK and try to delete the record. @@ -1977,44 +1937,28 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) tcp_seq, record->end_seq, tx_info->prev_seq, data_len); /* if a tls record is finishing in this SKB */ if (tls_end_offset <= data_len) { - struct sk_buff *nskb = NULL; - - if (tls_end_offset < data_len) { - nskb = alloc_skb(0, GFP_KERNEL); - if (unlikely(!nskb)) { - ret = -ENOMEM; - goto clear_ref; - } - - chcr_ktls_skb_shift(nskb, local_skb, - tls_end_offset); - } else { - /* its the only record in this skb, directly - * point it. - */ - nskb = local_skb; - } - ret = chcr_end_part_handler(tx_info, nskb, record, + ret = chcr_end_part_handler(tx_info, skb, record, tcp_seq, mss, (!th->fin && th->psh), q, + skb_offset, tls_end_offset, - (nskb == local_skb)); - - if (ret && nskb != local_skb) - dev_kfree_skb_any(local_skb); + skb_offset + + tls_end_offset == skb->len); data_len -= tls_end_offset; /* tcp_seq increment is required to handle next record. */ tcp_seq += tls_end_offset; + skb_offset += tls_end_offset; } else { - ret = chcr_short_record_handler(tx_info, local_skb, + ret = chcr_short_record_handler(tx_info, skb, record, tcp_seq, mss, (!th->fin && th->psh), + data_len, skb_offset, q, tls_end_offset); data_len = 0; } -clear_ref: + /* clear the frag ref count which increased locally before */ for (i = 0; i < record->num_frags; i++) { /* clear the frag ref count */ @@ -2022,7 +1966,8 @@ clear_ref: } /* if any failure, come out from the loop. */ if (ret) - goto out; + return NETDEV_TX_OK; + /* length should never be less than 0 */ WARN_ON(data_len < 0); @@ -2038,6 +1983,7 @@ clear_ref: if (th->fin) chcr_ktls_write_tcp_options(tx_info, skb, q, tx_info->tx_chan); + return NETDEV_TX_OK; out: dev_kfree_skb_any(skb); return NETDEV_TX_OK; -- cgit v1.2.3 From c68a28a9e2798a4602dde1c77046a3b577eb31f4 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:35 +0530 Subject: ch_ktls: Correction in trimmed_len calculation trimmed length calculation goes wrong if skb has only tag part to send. It should be zero if there is no data bytes apart from TAG. Fixes: dc05f3df8fac ("chcr: Handle first or middle part of record") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 950841988ffe..4286decce095 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1729,10 +1729,13 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, if (remaining_record > 0 && remaining_record < TLS_CIPHER_AES_GCM_128_TAG_SIZE) { - int trimmed_len = data_len - - (TLS_CIPHER_AES_GCM_128_TAG_SIZE - remaining_record); - /* don't process the pkt if it is only a partial tag */ - if (data_len < TLS_CIPHER_AES_GCM_128_TAG_SIZE) + int trimmed_len = 0; + + if (tls_end_offset > TLS_CIPHER_AES_GCM_128_TAG_SIZE) + trimmed_len = data_len - + (TLS_CIPHER_AES_GCM_128_TAG_SIZE - + remaining_record); + if (!trimmed_len) goto out; WARN_ON(trimmed_len > data_len); -- cgit v1.2.3 From 83deb094dd5c636a790da3914008570c9fd1693f Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:36 +0530 Subject: ch_ktls: missing handling of header alone If an skb has only header part which doesn't start from beginning, is not being handled properly. Fixes: dc05f3df8fac ("chcr: Handle first or middle part of record") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 25 ++++++++++------------ 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 4286decce095..8a54fce9bfae 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1744,6 +1744,17 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_trimmed_pkts); } + /* check if it is only the header part. */ + if (tls_rec_offset + data_len <= (TLS_HEADER_SIZE + tx_info->iv_size)) { + if (chcr_ktls_tx_plaintxt(tx_info, skb, tcp_seq, mss, + tcp_push_no_fin, q, + tx_info->port_id, prior_data, + data_len, skb_offset, prior_data_len)) + goto out; + + return 0; + } + /* check if the middle record's start point is 16 byte aligned. CTR * needs 16 byte aligned start point to start encryption. */ @@ -1812,20 +1823,6 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, goto out; atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_middle_pkts); } else { - /* Else means, its a partial first part of the record. Check if - * its only the header, don't need to send for encryption then. - */ - if (data_len <= TLS_HEADER_SIZE + tx_info->iv_size) { - if (chcr_ktls_tx_plaintxt(tx_info, skb, tcp_seq, mss, - tcp_push_no_fin, q, - tx_info->port_id, - prior_data, - data_len, skb_offset, - prior_data_len)) { - goto out; - } - return 0; - } atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_start_pkts); } -- cgit v1.2.3 From 63ee4591fa2f97dc08ce37514f214fc0430e9dc3 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:37 +0530 Subject: ch_ktls: Correction in middle record handling If a record starts in middle, reset TCB UNA so that we could avoid sending out extra packet which is needed to make it 16 byte aligned to start AES CTR. Check also considers prev_seq, which should be what is actually sent, not the skb data length. Avoid updating partial TAG to HW at any point of time, that's why we need to check if remaining part is smaller than TAG size, then reset TX_MAX to be TAG starting sequence number. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 50 ++++++++++++++-------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 8a54fce9bfae..026c66599d1e 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -827,7 +827,7 @@ static void *chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, */ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, struct sge_eth_txq *q, u64 tcp_seq, - u64 tcp_ack, u64 tcp_win) + u64 tcp_ack, u64 tcp_win, bool offset) { bool first_wr = ((tx_info->prev_ack == 0) && (tx_info->prev_win == 0)); struct ch_ktls_port_stats_debug *port_stats; @@ -862,7 +862,7 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, cpl++; } /* reset snd una if it's a re-transmit pkt */ - if (tcp_seq != tx_info->prev_seq) { + if (tcp_seq != tx_info->prev_seq || offset) { /* reset snd_una */ port_stats = &tx_info->adap->ch_ktls_stats.ktls_port[tx_info->port_id]; @@ -871,7 +871,8 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, TCB_SND_UNA_RAW_V (TCB_SND_UNA_RAW_M), TCB_SND_UNA_RAW_V(0), 0); - atomic64_inc(&port_stats->ktls_tx_ooo); + if (tcp_seq != tx_info->prev_seq) + atomic64_inc(&port_stats->ktls_tx_ooo); cpl++; } /* update ack */ @@ -1661,11 +1662,6 @@ static int chcr_end_part_handler(struct chcr_ktls_info *tx_info, * accordingly. */ tcp_seq = tls_record_start_seq(record); - /* reset snd una, so the middle record won't send the already - * sent part. - */ - if (chcr_ktls_update_snd_una(tx_info, q)) - goto out; /* reset skb offset */ skb_offset = 0; @@ -1684,6 +1680,7 @@ static int chcr_end_part_handler(struct chcr_ktls_info *tx_info, mss)) { goto out; } + tx_info->prev_seq = record->end_seq; return 0; out: dev_kfree_skb_any(nskb); @@ -1752,6 +1749,7 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, data_len, skb_offset, prior_data_len)) goto out; + tx_info->prev_seq = tcp_seq + data_len; return 0; } @@ -1832,6 +1830,7 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, goto out; } + tx_info->prev_seq = tcp_seq + data_len + prior_data_len; return 0; out: dev_kfree_skb_any(skb); @@ -1885,13 +1884,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) if (ret) return NETDEV_TX_BUSY; } - /* update tcb */ - ret = chcr_ktls_xmit_tcb_cpls(tx_info, q, ntohl(th->seq), - ntohl(th->ack_seq), - ntohs(th->window)); - if (ret) { - return NETDEV_TX_BUSY; - } /* TCP segments can be in received either complete or partial. * chcr_end_part_handler will handle cases if complete record or end @@ -1922,6 +1914,30 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) atomic64_inc(&port_stats->ktls_tx_skip_no_sync_data); goto out; } + tls_end_offset = record->end_seq - tcp_seq; + + pr_debug("seq 0x%x, end_seq 0x%x prev_seq 0x%x, datalen 0x%x\n", + tcp_seq, record->end_seq, tx_info->prev_seq, data_len); + /* update tcb for the skb */ + if (skb_data_len == data_len) { + u32 tx_max = tcp_seq; + + if (!tls_record_is_start_marker(record) && + tls_end_offset < TLS_CIPHER_AES_GCM_128_TAG_SIZE) + tx_max = record->end_seq - + TLS_CIPHER_AES_GCM_128_TAG_SIZE; + + ret = chcr_ktls_xmit_tcb_cpls(tx_info, q, tx_max, + ntohl(th->ack_seq), + ntohs(th->window), + tls_end_offset != + record->len); + if (ret) { + spin_unlock_irqrestore(&tx_ctx->base.lock, + flags); + goto out; + } + } /* increase page reference count of the record, so that there * won't be any chance of page free in middle if in case stack * receives ACK and try to delete the record. @@ -1931,10 +1947,7 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) /* lock cleared */ spin_unlock_irqrestore(&tx_ctx->base.lock, flags); - tls_end_offset = record->end_seq - tcp_seq; - pr_debug("seq 0x%x, end_seq 0x%x prev_seq 0x%x, datalen 0x%x\n", - tcp_seq, record->end_seq, tx_info->prev_seq, data_len); /* if a tls record is finishing in this SKB */ if (tls_end_offset <= data_len) { ret = chcr_end_part_handler(tx_info, skb, record, @@ -1973,7 +1986,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) } while (data_len > 0); - tx_info->prev_seq = ntohl(th->seq) + skb_data_len; atomic64_inc(&port_stats->ktls_tx_encrypted_packets); atomic64_add(skb_data_len, &port_stats->ktls_tx_encrypted_bytes); -- cgit v1.2.3 From 9478e083941c873d60a97b232760a14dec6c69d3 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:38 +0530 Subject: ch_ktls: packet handling prior to start marker There could be a case where ACK for tls exchanges prior to start marker is missed out, and by the time tls is offloaded. This pkt should not be discarded and handled carefully. It could be plaintext alone or plaintext + finish as well. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 38 +++++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 026c66599d1e..bbda71b7f98b 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1909,11 +1909,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) goto out; } - if (unlikely(tls_record_is_start_marker(record))) { - spin_unlock_irqrestore(&tx_ctx->base.lock, flags); - atomic64_inc(&port_stats->ktls_tx_skip_no_sync_data); - goto out; - } tls_end_offset = record->end_seq - tcp_seq; pr_debug("seq 0x%x, end_seq 0x%x prev_seq 0x%x, datalen 0x%x\n", @@ -1938,6 +1933,39 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) goto out; } } + + if (unlikely(tls_record_is_start_marker(record))) { + atomic64_inc(&port_stats->ktls_tx_skip_no_sync_data); + /* If tls_end_offset < data_len, means there is some + * data after start marker, which needs encryption, send + * plaintext first and take skb refcount. else send out + * complete pkt as plaintext. + */ + if (tls_end_offset < data_len) + skb_get(skb); + else + tls_end_offset = data_len; + + ret = chcr_ktls_tx_plaintxt(tx_info, skb, tcp_seq, mss, + (!th->fin && th->psh), q, + tx_info->port_id, NULL, + tls_end_offset, skb_offset, + 0); + + spin_unlock_irqrestore(&tx_ctx->base.lock, flags); + if (ret) { + /* free the refcount taken earlier */ + if (tls_end_offset < data_len) + dev_kfree_skb_any(skb); + goto out; + } + + data_len -= tls_end_offset; + tcp_seq = record->end_seq; + skb_offset += tls_end_offset; + continue; + } + /* increase page reference count of the record, so that there * won't be any chance of page free in middle if in case stack * receives ACK and try to delete the record. -- cgit v1.2.3 From 659bf0383d15b07e492e27443d87736b24171558 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:39 +0530 Subject: ch_ktls: don't free skb before sending FIN If its a last packet and fin is set. Make sure FIN is informed to HW before skb gets freed. Fixes: 429765a149f1 ("chcr: handle partial end part of a record") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index bbda71b7f98b..a8062e038ebc 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1932,6 +1932,9 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) flags); goto out; } + + if (th->fin) + skb_get(skb); } if (unlikely(tls_record_is_start_marker(record))) { @@ -2006,8 +2009,11 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) __skb_frag_unref(&record->frags[i]); } /* if any failure, come out from the loop. */ - if (ret) + if (ret) { + if (th->fin) + dev_kfree_skb_any(skb); return NETDEV_TX_OK; + } /* length should never be less than 0 */ WARN_ON(data_len < 0); @@ -2020,8 +2026,10 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) /* tcp finish is set, send a separate tcp msg including all the options * as well. */ - if (th->fin) + if (th->fin) { chcr_ktls_write_tcp_options(tx_info, skb, q, tx_info->tx_chan); + dev_kfree_skb_any(skb); + } return NETDEV_TX_OK; out: -- cgit v1.2.3 From 21f82acbb8b4e8812521d405479b6fc3790078de Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:40 +0530 Subject: ch_ktls/cxgb4: handle partial tag alone SKBs If TCP congestion caused a very small packets which only has some part fo the TAG, and that too is not till the end. HW can't handle such case, so falling back to sw crypto in such cases. v1->v2: - Marked chcr_ktls_sw_fallback() static. Fixes: dc05f3df8fac ("chcr: Handle first or middle part of record") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 2 + drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h | 1 + .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 116 ++++++++++++++++++++- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.h | 1 + 4 files changed, 119 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index 0273f40b85f7..17410fe86626 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -3573,6 +3573,8 @@ static int chcr_stats_show(struct seq_file *seq, void *v) atomic64_read(&adap->ch_ktls_stats.ktls_tx_complete_pkts)); seq_printf(seq, "TX trim pkts : %20llu\n", atomic64_read(&adap->ch_ktls_stats.ktls_tx_trimmed_pkts)); + seq_printf(seq, "TX sw fallback : %20llu\n", + atomic64_read(&adap->ch_ktls_stats.ktls_tx_fallback)); while (i < MAX_NPORTS) { ktls_port = &adap->ch_ktls_stats.ktls_port[i]; seq_printf(seq, "Port %d\n", i); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h index e2a4941fa802..1b49f2fa9b18 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -388,6 +388,7 @@ struct ch_ktls_stats_debug { atomic64_t ktls_tx_retransmit_pkts; atomic64_t ktls_tx_complete_pkts; atomic64_t ktls_tx_trimmed_pkts; + atomic64_t ktls_tx_fallback; }; #endif diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index a8062e038ebc..b182c940b4a0 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1545,6 +1545,88 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, return 0; } +static int chcr_ktls_tunnel_pkt(struct chcr_ktls_info *tx_info, + struct sk_buff *skb, + struct sge_eth_txq *q) +{ + u32 ctrl, iplen, maclen, wr_mid = 0, len16; + struct tx_sw_desc *sgl_sdesc; + struct fw_eth_tx_pkt_wr *wr; + struct cpl_tx_pkt_core *cpl; + unsigned int flits, ndesc; + int credits, last_desc; + u64 cntrl1, *end; + void *pos; + + ctrl = sizeof(*cpl); + flits = DIV_ROUND_UP(sizeof(*wr) + ctrl, 8); + + flits += chcr_sgl_len(skb_shinfo(skb)->nr_frags + 1); + len16 = DIV_ROUND_UP(flits, 2); + /* check how many descriptors needed */ + ndesc = DIV_ROUND_UP(flits, 8); + + credits = chcr_txq_avail(&q->q) - ndesc; + if (unlikely(credits < 0)) { + chcr_eth_txq_stop(q); + return -ENOMEM; + } + + if (unlikely(credits < ETHTXQ_STOP_THRES)) { + chcr_eth_txq_stop(q); + wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; + } + + last_desc = q->q.pidx + ndesc - 1; + if (last_desc >= q->q.size) + last_desc -= q->q.size; + sgl_sdesc = &q->q.sdesc[last_desc]; + + if (unlikely(cxgb4_map_skb(tx_info->adap->pdev_dev, skb, + sgl_sdesc->addr) < 0)) { + memset(sgl_sdesc->addr, 0, sizeof(sgl_sdesc->addr)); + q->mapping_err++; + return -ENOMEM; + } + + iplen = skb_network_header_len(skb); + maclen = skb_mac_header_len(skb); + + pos = &q->q.desc[q->q.pidx]; + end = (u64 *)pos + flits; + wr = pos; + + /* Firmware work request header */ + wr->op_immdlen = htonl(FW_WR_OP_V(FW_ETH_TX_PKT_WR) | + FW_WR_IMMDLEN_V(ctrl)); + + wr->equiq_to_len16 = htonl(wr_mid | FW_WR_LEN16_V(len16)); + wr->r3 = 0; + + cpl = (void *)(wr + 1); + + /* CPL header */ + cpl->ctrl0 = htonl(TXPKT_OPCODE_V(CPL_TX_PKT) | + TXPKT_INTF_V(tx_info->tx_chan) | + TXPKT_PF_V(tx_info->adap->pf)); + cpl->pack = 0; + cntrl1 = TXPKT_CSUM_TYPE_V(tx_info->ip_family == AF_INET ? + TX_CSUM_TCPIP : TX_CSUM_TCPIP6); + cntrl1 |= T6_TXPKT_ETHHDR_LEN_V(maclen - ETH_HLEN) | + TXPKT_IPHDR_LEN_V(iplen); + /* checksum offload */ + cpl->ctrl1 = cpu_to_be64(cntrl1); + cpl->len = htons(skb->len); + + pos = cpl + 1; + + cxgb4_write_sgl(skb, &q->q, pos, end, 0, sgl_sdesc->addr); + sgl_sdesc->skb = skb; + chcr_txq_advance(&q->q, ndesc); + cxgb4_ring_tx_db(tx_info->adap, &q->q, ndesc); + return 0; +} + /* * chcr_ktls_copy_record_in_skb * @nskb - new skb where the frags to be added. @@ -1733,7 +1815,7 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, (TLS_CIPHER_AES_GCM_128_TAG_SIZE - remaining_record); if (!trimmed_len) - goto out; + return FALLBACK; WARN_ON(trimmed_len > data_len); @@ -1837,6 +1919,34 @@ out: return NETDEV_TX_BUSY; } +static int chcr_ktls_sw_fallback(struct sk_buff *skb, + struct chcr_ktls_info *tx_info, + struct sge_eth_txq *q) +{ + u32 data_len, skb_offset; + struct sk_buff *nskb; + struct tcphdr *th; + + nskb = tls_encrypt_skb(skb); + + if (!nskb) + return 0; + + th = tcp_hdr(nskb); + skb_offset = skb_transport_offset(nskb) + tcp_hdrlen(nskb); + data_len = nskb->len - skb_offset; + skb_tx_timestamp(nskb); + + if (chcr_ktls_tunnel_pkt(tx_info, nskb, q)) + goto out; + + tx_info->prev_seq = ntohl(th->seq) + data_len; + atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_fallback); + return 0; +out: + dev_kfree_skb_any(nskb); + return 0; +} /* nic tls TX handler */ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -2012,6 +2122,10 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) if (ret) { if (th->fin) dev_kfree_skb_any(skb); + + if (ret == FALLBACK) + return chcr_ktls_sw_fallback(skb, tx_info, q); + return NETDEV_TX_OK; } diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.h b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.h index c1651b1431a0..18b3b1f02415 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.h +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.h @@ -26,6 +26,7 @@ #define CHCR_KTLS_WR_SIZE (CHCR_PLAIN_TX_DATA_LEN +\ sizeof(struct cpl_tx_sec_pdu)) +#define FALLBACK 35 enum ch_ktls_open_state { CH_KTLS_OPEN_SUCCESS = 0, -- cgit v1.2.3 From 7d01c428c86b525dc780226924d74df2048cf411 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:41 +0530 Subject: ch_ktls: tcb update fails sometimes context id and port id should be filled while sending tcb update. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index b182c940b4a0..a732051b21e4 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -733,7 +733,8 @@ static int chcr_ktls_cpl_set_tcb_rpl(struct adapter *adap, unsigned char *input) } static void *__chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, - u32 tid, void *pos, u16 word, u64 mask, + u32 tid, void *pos, u16 word, + struct sge_eth_txq *q, u64 mask, u64 val, u32 reply) { struct cpl_set_tcb_field_core *cpl; @@ -742,7 +743,10 @@ static void *__chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, /* ULP_TXPKT */ txpkt = pos; - txpkt->cmd_dest = htonl(ULPTX_CMD_V(ULP_TX_PKT) | ULP_TXPKT_DEST_V(0)); + txpkt->cmd_dest = htonl(ULPTX_CMD_V(ULP_TX_PKT) | + ULP_TXPKT_CHANNELID_V(tx_info->port_id) | + ULP_TXPKT_FID_V(q->q.cntxt_id) | + ULP_TXPKT_RO_F); txpkt->len = htonl(DIV_ROUND_UP(CHCR_SET_TCB_FIELD_LEN, 16)); /* ULPTX_IDATA sub-command */ @@ -797,7 +801,7 @@ static void *chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, } else { u8 buf[48] = {0}; - __chcr_write_cpl_set_tcb_ulp(tx_info, tid, buf, word, + __chcr_write_cpl_set_tcb_ulp(tx_info, tid, buf, word, q, mask, val, reply); return chcr_copy_to_txd(buf, &q->q, pos, @@ -805,7 +809,7 @@ static void *chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, } } - pos = __chcr_write_cpl_set_tcb_ulp(tx_info, tid, pos, word, + pos = __chcr_write_cpl_set_tcb_ulp(tx_info, tid, pos, word, q, mask, val, reply); /* check again if we are at the end of the queue */ -- cgit v1.2.3 From 83a95df04bee77c74df5151c961b19d870a70180 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:42 +0530 Subject: ch_ktls: stop the txq if reaches threshold Stop the queue and ask for the credits if queue reaches to threashold. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index a732051b21e4..c24485c0d512 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -835,7 +835,7 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, { bool first_wr = ((tx_info->prev_ack == 0) && (tx_info->prev_win == 0)); struct ch_ktls_port_stats_debug *port_stats; - u32 len, cpl = 0, ndesc, wr_len; + u32 len, cpl = 0, ndesc, wr_len, wr_mid = 0; struct fw_ulptx_wr *wr; int credits; void *pos; @@ -851,6 +851,11 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, return NETDEV_TX_BUSY; } + if (unlikely(credits < ETHTXQ_STOP_THRES)) { + chcr_eth_txq_stop(q); + wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; + } + pos = &q->q.desc[q->q.pidx]; /* make space for WR, we'll fill it later when we know all the cpls * being sent out and have complete length. @@ -905,7 +910,8 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, wr->op_to_compl = htonl(FW_WR_OP_V(FW_ULPTX_WR)); wr->cookie = 0; /* fill len in wr field */ - wr->flowid_len16 = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(len, 16))); + wr->flowid_len16 = htonl(wr_mid | + FW_WR_LEN16_V(DIV_ROUND_UP(len, 16))); ndesc = DIV_ROUND_UP(len, 64); chcr_txq_advance(&q->q, ndesc); @@ -986,6 +992,7 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, struct tcphdr *tcp; int len16, pktlen; struct iphdr *ip; + u32 wr_mid = 0; int credits; u8 buf[150]; u64 cntrl1; @@ -1010,6 +1017,11 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, return NETDEV_TX_BUSY; } + if (unlikely(credits < ETHTXQ_STOP_THRES)) { + chcr_eth_txq_stop(q); + wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; + } + pos = &q->q.desc[q->q.pidx]; wr = pos; @@ -1017,7 +1029,7 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, wr->op_immdlen = htonl(FW_WR_OP_V(FW_ETH_TX_PKT_WR) | FW_WR_IMMDLEN_V(ctrl)); - wr->equiq_to_len16 = htonl(FW_WR_LEN16_V(len16)); + wr->equiq_to_len16 = htonl(wr_mid | FW_WR_LEN16_V(len16)); wr->r3 = 0; cpl = (void *)(wr + 1); -- cgit v1.2.3 From 460cd17e9f7d60eaa22028baa6a056c478fa7dc6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Nov 2020 19:51:20 -0800 Subject: net: switch to the kernel.org patchwork instance Move to the kernel.org patchwork instance, it has significantly lower latency for accessing from Europe and the US. Other quirks include the reply bot. Link: https://lore.kernel.org/r/20201110035120.642746-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/netdev-FAQ.rst | 4 ++-- Documentation/process/stable-kernel-rules.rst | 2 +- .../it_IT/process/stable-kernel-rules.rst | 2 +- MAINTAINERS | 20 ++++++++++---------- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index d5c9320901c3..21537766be4d 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -110,7 +110,7 @@ Q: I sent a patch and I'm wondering what happened to it? Q: How can I tell whether it got merged? A: Start by looking at the main patchworks queue for netdev: - http://patchwork.ozlabs.org/project/netdev/list/ + https://patchwork.kernel.org/project/netdevbpf/list/ The "State" field will tell you exactly where things are at with your patch. @@ -152,7 +152,7 @@ networking subsystem, and then hands them off to Greg. There is a patchworks queue that you can see here: - http://patchwork.ozlabs.org/bundle/davem/stable/?state=* + https://patchwork.kernel.org/bundle/netdev/stable/?state=* It contains the patches which Dave has selected, but not yet handed off to Greg. If Greg already has the patch, then it will be here: diff --git a/Documentation/process/stable-kernel-rules.rst b/Documentation/process/stable-kernel-rules.rst index 06f743b612c4..3973556250e1 100644 --- a/Documentation/process/stable-kernel-rules.rst +++ b/Documentation/process/stable-kernel-rules.rst @@ -39,7 +39,7 @@ Procedure for submitting patches to the -stable tree submission guidelines as described in :ref:`Documentation/networking/netdev-FAQ.rst ` after first checking the stable networking queue at - https://patchwork.ozlabs.org/bundle/davem/stable/?series=&submitter=&state=*&q=&archive= + https://patchwork.kernel.org/bundle/netdev/stable/?state=* to ensure the requested patch is not already queued up. - Security patches should not be handled (solely) by the -stable review process but should follow the procedures in diff --git a/Documentation/translations/it_IT/process/stable-kernel-rules.rst b/Documentation/translations/it_IT/process/stable-kernel-rules.rst index 4f206cee31a7..283d62541c4f 100644 --- a/Documentation/translations/it_IT/process/stable-kernel-rules.rst +++ b/Documentation/translations/it_IT/process/stable-kernel-rules.rst @@ -46,7 +46,7 @@ Procedura per sottomettere patch per i sorgenti -stable :ref:`Documentation/translations/it_IT/networking/netdev-FAQ.rst `; ma solo dopo aver verificato al seguente indirizzo che la patch non sia già in coda: - https://patchwork.ozlabs.org/bundle/davem/stable/?series=&submitter=&state=*&q=&archive= + https://patchwork.kernel.org/bundle/netdev/stable/?state=* - Una patch di sicurezza non dovrebbero essere gestite (solamente) dal processo di revisione -stable, ma dovrebbe seguire le procedure descritte in :ref:`Documentation/translations/it_IT/admin-guide/security-bugs.rst `. diff --git a/MAINTAINERS b/MAINTAINERS index fa11fe773df5..eaded39a3858 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1279,7 +1279,7 @@ M: Igor Russkikh L: netdev@vger.kernel.org S: Supported W: https://www.marvell.com/ -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: Documentation/networking/device_drivers/ethernet/aquantia/atlantic.rst F: drivers/net/ethernet/aquantia/atlantic/ @@ -11173,7 +11173,7 @@ M: Tariq Toukan L: netdev@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlx4/en_* MELLANOX ETHERNET DRIVER (mlx5e) @@ -11181,7 +11181,7 @@ M: Saeed Mahameed L: netdev@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlx5/core/en_* MELLANOX ETHERNET INNOVA DRIVERS @@ -11189,7 +11189,7 @@ R: Boris Pismenny L: netdev@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlx5/core/accel/* F: drivers/net/ethernet/mellanox/mlx5/core/en_accel/* F: drivers/net/ethernet/mellanox/mlx5/core/fpga/* @@ -11201,7 +11201,7 @@ M: Ido Schimmel L: netdev@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlxsw/ F: tools/testing/selftests/drivers/net/mlxsw/ @@ -11210,7 +11210,7 @@ M: mlxsw@nvidia.com L: netdev@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlxfw/ MELLANOX HARDWARE PLATFORM SUPPORT @@ -11229,7 +11229,7 @@ L: netdev@vger.kernel.org L: linux-rdma@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlx4/ F: include/linux/mlx4/ @@ -11250,7 +11250,7 @@ L: netdev@vger.kernel.org L: linux-rdma@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: Documentation/networking/device_drivers/ethernet/mellanox/ F: drivers/net/ethernet/mellanox/mlx5/core/ F: include/linux/mlx5/ @@ -12130,7 +12130,7 @@ M: Jakub Kicinski L: netdev@vger.kernel.org S: Maintained W: http://www.linuxfoundation.org/en/Net -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git F: Documentation/devicetree/bindings/net/ @@ -12175,7 +12175,7 @@ M: Jakub Kicinski L: netdev@vger.kernel.org S: Maintained W: http://www.linuxfoundation.org/en/Net -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ B: mailto:netdev@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git -- cgit v1.2.3 From 52755b66ddcef2e897778fac5656df18817b59ab Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Tue, 10 Nov 2020 22:46:14 +0800 Subject: cosa: Add missing kfree in error path of cosa_write If memory allocation for 'kbuf' succeed, cosa_write() doesn't have a corresponding kfree() in exception handling. Thus add kfree() for this function implementation. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Hulk Robot Signed-off-by: Wang Hai Acked-by: Jan "Yenya" Kasprzak Link: https://lore.kernel.org/r/20201110144614.43194-1-wanghai38@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/wan/cosa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c index f8aed0696d77..2369ca250cd6 100644 --- a/drivers/net/wan/cosa.c +++ b/drivers/net/wan/cosa.c @@ -889,6 +889,7 @@ static ssize_t cosa_write(struct file *file, chan->tx_status = 1; spin_unlock_irqrestore(&cosa->lock, flags); up(&chan->wsem); + kfree(kbuf); return -ERESTARTSYS; } } -- cgit v1.2.3 From 70438afbf17e5194dd607dd17759560a363b7bb4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 21 Oct 2020 10:34:15 -0400 Subject: NFSv4.2: fix failure to unregister shrinker We forgot to unregister the nfs4_xattr_large_entry_shrinker. That leaves the global list of shrinkers corrupted after unload of the nfs module, after which possibly unrelated code that calls register_shrinker() or unregister_shrinker() gets a BUG() with "supervisor write access in kernel mode". And similarly for the nfs4_xattr_large_entry_lru. Reported-by: Kris Karas Tested-By: Kris Karas Fixes: 95ad37f90c33 "NFSv4.2: add client side xattr caching." Signed-off-by: J. Bruce Fields CC: stable@vger.kernel.org Signed-off-by: Anna Schumaker --- fs/nfs/nfs42xattr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index b51424ff8159..6c2ce799150f 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -1047,8 +1047,10 @@ out4: void nfs4_xattr_cache_exit(void) { + unregister_shrinker(&nfs4_xattr_large_entry_shrinker); unregister_shrinker(&nfs4_xattr_entry_shrinker); unregister_shrinker(&nfs4_xattr_cache_shrinker); + list_lru_destroy(&nfs4_xattr_large_entry_lru); list_lru_destroy(&nfs4_xattr_entry_lru); list_lru_destroy(&nfs4_xattr_cache_lru); kmem_cache_destroy(nfs4_xattr_cache_cachep); -- cgit v1.2.3 From 6c2190b3fcbc92cb79e39cc7e7531656b341e463 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 31 Oct 2020 12:44:25 -0400 Subject: NFS: Fix listxattr receive buffer size Certain NFSv4.2/RDMA tests fail with v5.9-rc1. rpcrdma_convert_kvec() runs off the end of the rl_segments array because rq_rcv_buf.tail[0].iov_len holds a very large positive value. The resultant kernel memory corruption is enough to crash the client system. Callers of rpc_prepare_reply_pages() must reserve an extra XDR_UNIT in the maximum decode size for a possible XDR pad of the contents of the xdr_buf's pages. That guarantees the allocated receive buffer will be large enough to accommodate the usual contents plus that XDR pad word. encode_op_hdr() cannot add that extra word. If it does, xdr_inline_pages() underruns the length of the tail iovec. Fixes: 3e1f02123fba ("NFSv4.2: add client side XDR handling for extended attributes") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/nfs42xdr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 0dc31ad2362e..6e060a88f98c 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -196,7 +196,7 @@ 1 + nfs4_xattr_name_maxsz + 1) #define decode_setxattr_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) #define encode_listxattrs_maxsz (op_encode_hdr_maxsz + 2 + 1) -#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1) +#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + 1) #define encode_removexattr_maxsz (op_encode_hdr_maxsz + 1 + \ nfs4_xattr_name_maxsz) #define decode_removexattr_maxsz (op_decode_hdr_maxsz + \ @@ -531,7 +531,7 @@ static void encode_listxattrs(struct xdr_stream *xdr, { __be32 *p; - encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz + 1, hdr); + encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz, hdr); p = reserve_space(xdr, 12); if (unlikely(!p)) -- cgit v1.2.3 From 83f2c45e63935a325f73bde98b1609e0976a12e0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 30 Oct 2020 17:57:29 -0400 Subject: NFS: Remove unnecessary inode locking in nfs_llseek_dir() Remove the contentious inode lock, and instead provide thread safety using the file->f_lock spinlock. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index cb52db9a0cfb..e56b1bd99537 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -955,7 +955,6 @@ out: static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) { - struct inode *inode = file_inode(filp); struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", @@ -967,15 +966,15 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) case SEEK_SET: if (offset < 0) return -EINVAL; - inode_lock(inode); + spin_lock(&filp->f_lock); break; case SEEK_CUR: if (offset == 0) return filp->f_pos; - inode_lock(inode); + spin_lock(&filp->f_lock); offset += filp->f_pos; if (offset < 0) { - inode_unlock(inode); + spin_unlock(&filp->f_lock); return -EINVAL; } } @@ -987,7 +986,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->dir_cookie = 0; dir_ctx->duped = 0; } - inode_unlock(inode); + spin_unlock(&filp->f_lock); return offset; } -- cgit v1.2.3 From 11decaf8127b035242cb55de2fc6946f8961f671 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 30 Oct 2020 17:57:30 -0400 Subject: NFS: Remove unnecessary inode lock in nfs_fsync_dir() nfs_inc_stats() is already thread-safe, and there are no other reasons to hold the inode lock here. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e56b1bd99537..4e011adaf967 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -997,13 +997,9 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, int datasync) { - struct inode *inode = file_inode(filp); - dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); - inode_lock(inode); - nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - inode_unlock(inode); + nfs_inc_stats(file_inode(filp), NFSIOS_VFSFSYNC); return 0; } -- cgit v1.2.3 From 9e2b7fa2df4365e99934901da4fb4af52d81e820 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Fri, 6 Nov 2020 08:30:30 +0100 Subject: vrf: Fix fast path output packet handling with async Netfilter rules VRF devices use an optimized direct path on output if a default qdisc is involved, calling Netfilter hooks directly. This path, however, does not consider Netfilter rules completing asynchronously, such as with NFQUEUE. The Netfilter okfn() is called for asynchronously accepted packets, but the VRF never passes that packet down the stack to send it out over the slave device. Using the slower redirect path for this seems not feasible, as we do not know beforehand if a Netfilter hook has asynchronously completing rules. Fix the use of asynchronously completing Netfilter rules in OUTPUT and POSTROUTING by using a special completion function that additionally calls dst_output() to pass the packet down the stack. Also, slightly adjust the use of nf_reset_ct() so that is called in the asynchronous case, too. Fixes: dcdd43c41e60 ("net: vrf: performance improvements for IPv4") Fixes: a9ec54d1b0cd ("net: vrf: performance improvements for IPv6") Signed-off-by: Martin Willi Link: https://lore.kernel.org/r/20201106073030.3974927-1-martin@strongswan.org Signed-off-by: Jakub Kicinski --- drivers/net/vrf.c | 92 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 23 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 60c1aadece89..f2793ffde191 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -608,8 +608,7 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) return ret; } -static int vrf_finish_direct(struct net *net, struct sock *sk, - struct sk_buff *skb) +static void vrf_finish_direct(struct sk_buff *skb) { struct net_device *vrf_dev = skb->dev; @@ -628,7 +627,8 @@ static int vrf_finish_direct(struct net *net, struct sock *sk, skb_pull(skb, ETH_HLEN); } - return 1; + /* reset skb device */ + nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) @@ -707,15 +707,41 @@ static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev, return skb; } +static int vrf_output6_direct_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + vrf_finish_direct(skb); + + return vrf_ip6_local_out(net, sk, skb); +} + static int vrf_output6_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { + int err = 1; + skb->protocol = htons(ETH_P_IPV6); - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, NULL, skb->dev, - vrf_finish_direct, - !(IPCB(skb)->flags & IPSKB_REROUTED)); + if (!(IPCB(skb)->flags & IPSKB_REROUTED)) + err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, + NULL, skb->dev, vrf_output6_direct_finish); + + if (likely(err == 1)) + vrf_finish_direct(skb); + + return err; +} + +static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int err; + + err = vrf_output6_direct(net, sk, skb); + if (likely(err == 1)) + err = vrf_ip6_local_out(net, sk, skb); + + return err; } static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, @@ -728,18 +754,15 @@ static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, - skb, NULL, vrf_dev, vrf_output6_direct); + skb, NULL, vrf_dev, vrf_ip6_out_direct_finish); if (likely(err == 1)) err = vrf_output6_direct(net, sk, skb); - /* reset skb device */ if (likely(err == 1)) - nf_reset_ct(skb); - else - skb = NULL; + return skb; - return skb; + return NULL; } static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, @@ -919,15 +942,41 @@ static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev, return skb; } +static int vrf_output_direct_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + vrf_finish_direct(skb); + + return vrf_ip_local_out(net, sk, skb); +} + static int vrf_output_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { + int err = 1; + skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, NULL, skb->dev, - vrf_finish_direct, - !(IPCB(skb)->flags & IPSKB_REROUTED)); + if (!(IPCB(skb)->flags & IPSKB_REROUTED)) + err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, + NULL, skb->dev, vrf_output_direct_finish); + + if (likely(err == 1)) + vrf_finish_direct(skb); + + return err; +} + +static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int err; + + err = vrf_output_direct(net, sk, skb); + if (likely(err == 1)) + err = vrf_ip_local_out(net, sk, skb); + + return err; } static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, @@ -940,18 +989,15 @@ static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, - skb, NULL, vrf_dev, vrf_output_direct); + skb, NULL, vrf_dev, vrf_ip_out_direct_finish); if (likely(err == 1)) err = vrf_output_direct(net, sk, skb); - /* reset skb device */ if (likely(err == 1)) - nf_reset_ct(skb); - else - skb = NULL; + return skb; - return skb; + return NULL; } static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, -- cgit v1.2.3 From 9f73bd1c2c4c304b238051fc92b3f807326f0a89 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 11 Nov 2020 05:47:44 +0200 Subject: devlink: Avoid overwriting port attributes of registered port Cited commit in fixes tag overwrites the port attributes for the registered port. Avoid such error by checking registered flag before setting attributes. Fixes: 71ad8d55f8e5 ("devlink: Replace devlink_port_attrs_set parameters with a struct") Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20201111034744.35554-1-parav@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/core/devlink.c b/net/core/devlink.c index a932d95be798..ab4b1368904f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8254,8 +8254,6 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port, { struct devlink_port_attrs *attrs = &devlink_port->attrs; - if (WARN_ON(devlink_port->registered)) - return -EEXIST; devlink_port->attrs_set = true; attrs->flavour = flavour; if (attrs->switch_id.id_len) { @@ -8279,6 +8277,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, { int ret; + if (WARN_ON(devlink_port->registered)) + return; devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) @@ -8301,6 +8301,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; + if (WARN_ON(devlink_port->registered)) + return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) @@ -8326,6 +8328,8 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; + if (WARN_ON(devlink_port->registered)) + return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) -- cgit v1.2.3 From 4b1a86281cc1d0de46df3ad2cb8c1f86ac07681c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 11 Nov 2020 20:45:25 +0000 Subject: net: udp: fix UDP header access on Fast/frag0 UDP GRO UDP GRO uses udp_hdr(skb) in its .gro_receive() callback. While it's probably OK for non-frag0 paths (when all headers or even the entire frame are already in skb head), this inline points to junk when using Fast GRO (napi_gro_frags() or napi_gro_receive() with only Ethernet header in skb head and all the rest in the frags) and breaks GRO packet compilation and the packet flow itself. To support both modes, skb_gro_header_fast() + skb_gro_header_slow() are typically used. UDP even has an inline helper that makes use of them, udp_gro_udphdr(). Use that instead of troublemaking udp_hdr() to get rid of the out-of-order delivers. Present since the introduction of plain UDP GRO in 5.0-rc1. Fixes: e20cf8d3f1f7 ("udp: implement GRO for plain UDP sockets.") Cc: Eric Dumazet Signed-off-by: Alexander Lobakin Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- net/ipv4/udp_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e67a66fbf27b..13740e9fe6ec 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -366,7 +366,7 @@ out: static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct sk_buff *skb) { - struct udphdr *uh = udp_hdr(skb); + struct udphdr *uh = udp_gro_udphdr(skb); struct sk_buff *pp = NULL; struct udphdr *uh2; struct sk_buff *p; -- cgit v1.2.3 From 55e729889bb07d68ab071660ce3f5e7a7872ebe8 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 11 Nov 2020 20:45:38 +0000 Subject: net: udp: fix IP header access and skb lookup on Fast/frag0 UDP GRO udp{4,6}_lib_lookup_skb() use ip{,v6}_hdr() to get IP header of the packet. While it's probably OK for non-frag0 paths, this helpers will also point to junk on Fast/frag0 GRO when all headers are located in frags. As a result, sk/skb lookup may fail or give wrong results. To support both GRO modes, skb_gro_network_header() might be used. To not modify original functions, add private versions of udp{4,6}_lib_lookup_skb() only to perform correct sk lookups on GRO. Present since the introduction of "application-level" UDP GRO in 4.7-rc1. Misc: replace totally unneeded ternaries with plain ifs. Fixes: a6024562ffd7 ("udp: Add GRO functions to UDP socket") Suggested-by: Willem de Bruijn Cc: Eric Dumazet Signed-off-by: Alexander Lobakin Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- net/ipv4/udp_offload.c | 17 +++++++++++++++-- net/ipv6/udp_offload.c | 17 +++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 13740e9fe6ec..c62805cd3131 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -500,12 +500,22 @@ out: } EXPORT_SYMBOL(udp_gro_receive); +static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, + __be16 dport) +{ + const struct iphdr *iph = skb_gro_network_header(skb); + + return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + inet_sdif(skb), &udp_table, NULL); +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sock *sk = NULL; struct sk_buff *pp; - struct sock *sk; if (unlikely(!uh)) goto flush; @@ -523,7 +533,10 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) skip: NAPI_GRO_CB(skb)->is_ipv6 = 0; rcu_read_lock(); - sk = static_branch_unlikely(&udp_encap_needed_key) ? udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + + if (static_branch_unlikely(&udp_encap_needed_key)) + sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest); + pp = udp_gro_receive(head, skb, uh, sk); rcu_read_unlock(); return pp; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 584157a07759..f9e888d1b9af 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -111,12 +111,22 @@ out: return segs; } +static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, + __be16 dport) +{ + const struct ipv6hdr *iph = skb_gro_network_header(skb); + + return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, + &iph->daddr, dport, inet6_iif(skb), + inet6_sdif(skb), &udp_table, NULL); +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sock *sk = NULL; struct sk_buff *pp; - struct sock *sk; if (unlikely(!uh)) goto flush; @@ -135,7 +145,10 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) skip: NAPI_GRO_CB(skb)->is_ipv6 = 1; rcu_read_lock(); - sk = static_branch_unlikely(&udpv6_encap_needed_key) ? udp6_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + + if (static_branch_unlikely(&udpv6_encap_needed_key)) + sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest); + pp = udp_gro_receive(head, skb, uh, sk); rcu_read_unlock(); return pp; -- cgit v1.2.3 From edbc21113bde13ca3d06eec24b621b1f628583dd Mon Sep 17 00:00:00 2001 From: Sven Van Asbroeck Date: Thu, 12 Nov 2020 10:25:13 -0500 Subject: lan743x: fix use of uninitialized variable When no devicetree is present, the driver will use an uninitialized variable. Fix by initializing this variable. Fixes: 902a66e08cea ("lan743x: correctly handle chips with internal PHY") Reported-by: kernel test robot Signed-off-by: Sven Van Asbroeck Link: https://lore.kernel.org/r/20201112152513.1941-1-TheSven73@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan743x_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 173158656559..e2c99d909247 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1014,8 +1014,8 @@ static void lan743x_phy_close(struct lan743x_adapter *adapter) static int lan743x_phy_open(struct lan743x_adapter *adapter) { struct lan743x_phy *phy = &adapter->phy; + struct phy_device *phydev = NULL; struct device_node *phynode; - struct phy_device *phydev; struct net_device *netdev; int ret = -EIO; -- cgit v1.2.3