diff options
author | Arnaldo Carvalho de Melo <acme@redhat.com> | 2020-12-17 18:37:24 +0100 |
---|---|---|
committer | Arnaldo Carvalho de Melo <acme@redhat.com> | 2020-12-17 18:37:24 +0100 |
commit | 281a94b0f2f0775a2b7825c18bccf7e4c922b7b3 (patch) | |
tree | 9c6b8a2d392a1640d84989de25e079f3758c4f1a /tools | |
parent | perf tools: Reformat record's control fd man text (diff) | |
parent | Merge tag 'arm-soc-omap-genpd-5.11' of git://git.kernel.org/pub/scm/linux/ker... (diff) | |
download | linux-281a94b0f2f0775a2b7825c18bccf7e4c922b7b3.tar.xz linux-281a94b0f2f0775a2b7825c18bccf7e4c922b7b3.zip |
Merge remote-tracking branch 'torvalds/master' into perf/core
To pick up fixes and check what UAPI headers need to be synched.
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'tools')
294 files changed, 17688 insertions, 3760 deletions
diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 568854b14d0a..52c6262e6bfd 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -201,6 +201,21 @@ static inline int insn_offset_immediate(struct insn *insn) return insn_offset_displacement(insn) + insn->displacement.nbytes; } +/** + * for_each_insn_prefix() -- Iterate prefixes in the instruction + * @insn: Pointer to struct insn. + * @idx: Index storage. + * @prefix: Prefix byte. + * + * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix + * and the index is stored in @idx (note that this @idx is just for a cursor, + * do not change it.) + * Since prefixes.nbytes can be bigger than 4 if some prefixes + * are repeated, it cannot be used for looping over the prefixes. + */ +#define for_each_insn_prefix(insn, idx, prefix) \ + for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) + #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index eb92027817a7..7362bef1a368 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -10,6 +10,7 @@ #include <unistd.h> #include <string.h> #include <errno.h> +#include <endian.h> #include <linux/kernel.h> #include <linux/bootconfig.h> @@ -147,6 +148,12 @@ static int load_xbc_file(const char *path, char **buf) return ret; } +static int pr_errno(const char *msg, int err) +{ + pr_err("%s: %d\n", msg, err); + return err; +} + static int load_xbc_from_initrd(int fd, char **buf) { struct stat stat; @@ -162,26 +169,26 @@ static int load_xbc_from_initrd(int fd, char **buf) if (stat.st_size < 8 + BOOTCONFIG_MAGIC_LEN) return 0; - if (lseek(fd, -BOOTCONFIG_MAGIC_LEN, SEEK_END) < 0) { - pr_err("Failed to lseek: %d\n", -errno); - return -errno; - } + if (lseek(fd, -BOOTCONFIG_MAGIC_LEN, SEEK_END) < 0) + return pr_errno("Failed to lseek for magic", -errno); + if (read(fd, magic, BOOTCONFIG_MAGIC_LEN) < 0) - return -errno; + return pr_errno("Failed to read", -errno); + /* Check the bootconfig magic bytes */ if (memcmp(magic, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN) != 0) return 0; - if (lseek(fd, -(8 + BOOTCONFIG_MAGIC_LEN), SEEK_END) < 0) { - pr_err("Failed to lseek: %d\n", -errno); - return -errno; - } + if (lseek(fd, -(8 + BOOTCONFIG_MAGIC_LEN), SEEK_END) < 0) + return pr_errno("Failed to lseek for size", -errno); if (read(fd, &size, sizeof(u32)) < 0) - return -errno; + return pr_errno("Failed to read size", -errno); + size = le32toh(size); if (read(fd, &csum, sizeof(u32)) < 0) - return -errno; + return pr_errno("Failed to read checksum", -errno); + csum = le32toh(csum); /* Wrong size error */ if (stat.st_size < size + 8 + BOOTCONFIG_MAGIC_LEN) { @@ -190,10 +197,8 @@ static int load_xbc_from_initrd(int fd, char **buf) } if (lseek(fd, stat.st_size - (size + 8 + BOOTCONFIG_MAGIC_LEN), - SEEK_SET) < 0) { - pr_err("Failed to lseek: %d\n", -errno); - return -errno; - } + SEEK_SET) < 0) + return pr_errno("Failed to lseek", -errno); ret = load_xbc_fd(fd, buf, size); if (ret < 0) @@ -262,14 +267,16 @@ static int show_xbc(const char *path, bool list) ret = stat(path, &st); if (ret < 0) { - pr_err("Failed to stat %s: %d\n", path, -errno); - return -errno; + ret = -errno; + pr_err("Failed to stat %s: %d\n", path, ret); + return ret; } fd = open(path, O_RDONLY); if (fd < 0) { - pr_err("Failed to open initrd %s: %d\n", path, fd); - return -errno; + ret = -errno; + pr_err("Failed to open initrd %s: %d\n", path, ret); + return ret; } ret = load_xbc_from_initrd(fd, &buf); @@ -307,8 +314,9 @@ static int delete_xbc(const char *path) fd = open(path, O_RDWR); if (fd < 0) { - pr_err("Failed to open initrd %s: %d\n", path, fd); - return -errno; + ret = -errno; + pr_err("Failed to open initrd %s: %d\n", path, ret); + return ret; } size = load_xbc_from_initrd(fd, &buf); @@ -332,11 +340,13 @@ static int delete_xbc(const char *path) static int apply_xbc(const char *path, const char *xbc_path) { + char *buf, *data, *p; + size_t total_size; + struct stat stat; + const char *msg; u32 size, csum; - char *buf, *data; + int pos, pad; int ret, fd; - const char *msg; - int pos; ret = load_xbc_file(xbc_path, &buf); if (ret < 0) { @@ -346,13 +356,12 @@ static int apply_xbc(const char *path, const char *xbc_path) size = strlen(buf) + 1; csum = checksum((unsigned char *)buf, size); - /* Prepare xbc_path data */ - data = malloc(size + 8); + /* Backup the bootconfig data */ + data = calloc(size + BOOTCONFIG_ALIGN + + sizeof(u32) + sizeof(u32) + BOOTCONFIG_MAGIC_LEN, 1); if (!data) return -ENOMEM; - strcpy(data, buf); - *(u32 *)(data + size) = size; - *(u32 *)(data + size + 4) = csum; + memcpy(data, buf, size); /* Check the data format */ ret = xbc_init(buf, &msg, &pos); @@ -383,28 +392,61 @@ static int apply_xbc(const char *path, const char *xbc_path) /* Apply new one */ fd = open(path, O_RDWR | O_APPEND); if (fd < 0) { - pr_err("Failed to open %s: %d\n", path, fd); + ret = -errno; + pr_err("Failed to open %s: %d\n", path, ret); free(data); - return fd; + return ret; } /* TODO: Ensure the @path is initramfs/initrd image */ - ret = write(fd, data, size + 8); - if (ret < 0) { - pr_err("Failed to apply a boot config: %d\n", ret); + if (fstat(fd, &stat) < 0) { + pr_err("Failed to get the size of %s\n", path); goto out; } - /* Write a magic word of the bootconfig */ - ret = write(fd, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN); - if (ret < 0) { - pr_err("Failed to apply a boot config magic: %d\n", ret); - goto out; - } - ret = 0; + + /* To align up the total size to BOOTCONFIG_ALIGN, get padding size */ + total_size = stat.st_size + size + sizeof(u32) * 2 + BOOTCONFIG_MAGIC_LEN; + pad = ((total_size + BOOTCONFIG_ALIGN - 1) & (~BOOTCONFIG_ALIGN_MASK)) - total_size; + size += pad; + + /* Add a footer */ + p = data + size; + *(u32 *)p = htole32(size); + p += sizeof(u32); + + *(u32 *)p = htole32(csum); + p += sizeof(u32); + + memcpy(p, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN); + p += BOOTCONFIG_MAGIC_LEN; + + total_size = p - data; + + ret = write(fd, data, total_size); + if (ret < total_size) { + if (ret < 0) + ret = -errno; + pr_err("Failed to apply a boot config: %d\n", ret); + if (ret >= 0) + goto out_rollback; + } else + ret = 0; + out: close(fd); free(data); return ret; + +out_rollback: + /* Map the partial write to -ENOSPC */ + if (ret >= 0) + ret = -ENOSPC; + if (ftruncate(fd, stat.st_size) < 0) { + ret = -errno; + pr_err("Failed to rollback the write error: %d\n", ret); + pr_err("The initrd %s may be corrupted. Recommend to rebuild.\n", path); + } + goto out; } static int usage(void) diff --git a/tools/bootconfig/test-bootconfig.sh b/tools/bootconfig/test-bootconfig.sh index d295e406a756..baed891d0ba4 100755 --- a/tools/bootconfig/test-bootconfig.sh +++ b/tools/bootconfig/test-bootconfig.sh @@ -9,6 +9,7 @@ else TESTDIR=. fi BOOTCONF=${TESTDIR}/bootconfig +ALIGN=4 INITRD=`mktemp ${TESTDIR}/initrd-XXXX` TEMPCONF=`mktemp ${TESTDIR}/temp-XXXX.bconf` @@ -59,7 +60,10 @@ echo "Show command test" xpass $BOOTCONF $INITRD echo "File size check" -xpass test $new_size -eq $(expr $bconf_size + $initrd_size + 9 + 12) +total_size=$(expr $bconf_size + $initrd_size + 9 + 12 + $ALIGN - 1 ) +total_size=$(expr $total_size / $ALIGN) +total_size=$(expr $total_size \* $ALIGN) +xpass test $new_size -eq $total_size echo "Apply command repeat test" xpass $BOOTCONF -a $TEMPCONF $INITRD diff --git a/tools/bpf/bpftool/.gitignore b/tools/bpf/bpftool/.gitignore index 3e601bcfd461..944cb4b7c95d 100644 --- a/tools/bpf/bpftool/.gitignore +++ b/tools/bpf/bpftool/.gitignore @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only *.d -/bpftool-bootstrap +/bootstrap/ /bpftool bpftool*.8 bpf-helpers.* diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index dade10cdf295..3d52256ba75f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -50,7 +50,8 @@ MAP COMMANDS | | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** -| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** } +| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** + | **task_storage** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index f60e6ad3a1df..f897cb5fb12d 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -19,22 +19,39 @@ BPF_DIR = $(srctree)/tools/lib/bpf/ ifneq ($(OUTPUT),) LIBBPF_OUTPUT = $(OUTPUT)/libbpf/ LIBBPF_PATH = $(LIBBPF_OUTPUT) + BOOTSTRAP_OUTPUT = $(OUTPUT)/bootstrap/ else + LIBBPF_OUTPUT = LIBBPF_PATH = $(BPF_DIR) + BOOTSTRAP_OUTPUT = $(CURDIR)/bootstrap/ endif LIBBPF = $(LIBBPF_PATH)libbpf.a +LIBBPF_BOOTSTRAP_OUTPUT = $(BOOTSTRAP_OUTPUT)libbpf/ +LIBBPF_BOOTSTRAP = $(LIBBPF_BOOTSTRAP_OUTPUT)libbpf.a -BPFTOOL_VERSION ?= $(shell make -rR --no-print-directory -sC ../../.. kernelversion) +ifeq ($(BPFTOOL_VERSION),) +BPFTOOL_VERSION := $(shell make -rR --no-print-directory -sC ../../.. kernelversion) +endif + +$(LIBBPF_OUTPUT) $(BOOTSTRAP_OUTPUT) $(LIBBPF_BOOTSTRAP_OUTPUT): + $(QUIET_MKDIR)mkdir -p $@ -$(LIBBPF): FORCE - $(if $(LIBBPF_OUTPUT),@mkdir -p $(LIBBPF_OUTPUT)) +$(LIBBPF): FORCE | $(LIBBPF_OUTPUT) $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) $(LIBBPF_OUTPUT)libbpf.a -$(LIBBPF)-clean: +$(LIBBPF_BOOTSTRAP): FORCE | $(LIBBPF_BOOTSTRAP_OUTPUT) + $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) \ + ARCH= CC=$(HOSTCC) LD=$(HOSTLD) $@ + +$(LIBBPF)-clean: FORCE | $(LIBBPF_OUTPUT) $(call QUIET_CLEAN, libbpf) $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) clean >/dev/null +$(LIBBPF_BOOTSTRAP)-clean: FORCE | $(LIBBPF_BOOTSTRAP_OUTPUT) + $(call QUIET_CLEAN, libbpf-bootstrap) + $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) clean >/dev/null + prefix ?= /usr/local bash_compdir ?= /usr/share/bash-completion/completions @@ -92,6 +109,7 @@ CFLAGS += -DCOMPAT_NEED_REALLOCARRAY endif LIBS = $(LIBBPF) -lelf -lz +LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz ifeq ($(feature-libcap), 1) CFLAGS += -DUSE_LIBCAP LIBS += -lcap @@ -118,9 +136,9 @@ CFLAGS += -DHAVE_LIBBFD_SUPPORT SRCS += $(BFD_SRCS) endif -BPFTOOL_BOOTSTRAP := $(if $(OUTPUT),$(OUTPUT)bpftool-bootstrap,./bpftool-bootstrap) +BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool -BOOTSTRAP_OBJS = $(addprefix $(OUTPUT),main.o common.o json_writer.o gen.o btf.o) +BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o) OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ @@ -167,12 +185,16 @@ $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(OUTPUT)feature.o: | zdep -$(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF) - $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(BOOTSTRAP_OBJS) $(LIBS) +$(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP) + $(QUIET_LINK)$(HOSTCC) $(CFLAGS) $(LDFLAGS) -o $@ $(BOOTSTRAP_OBJS) \ + $(LIBS_BOOTSTRAP) $(OUTPUT)bpftool: $(OBJS) $(LIBBPF) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJS) $(LIBS) +$(BOOTSTRAP_OUTPUT)%.o: %.c | $(BOOTSTRAP_OUTPUT) + $(QUIET_CC)$(HOSTCC) $(CFLAGS) -c -MMD -o $@ $< + $(OUTPUT)%.o: %.c $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -o $@ $< @@ -180,11 +202,11 @@ feature-detect-clean: $(call QUIET_CLEAN, feature-detect) $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null -clean: $(LIBBPF)-clean feature-detect-clean +clean: $(LIBBPF)-clean $(LIBBPF_BOOTSTRAP)-clean feature-detect-clean $(call QUIET_CLEAN, bpftool) $(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d - $(Q)$(RM) -- $(BPFTOOL_BOOTSTRAP) $(OUTPUT)*.skel.h $(OUTPUT)vmlinux.h - $(Q)$(RM) -r -- $(OUTPUT)libbpf/ + $(Q)$(RM) -- $(OUTPUT)*.skel.h $(OUTPUT)vmlinux.h + $(Q)$(RM) -r -- $(LIBBPF_OUTPUT) $(BOOTSTRAP_OUTPUT) $(call QUIET_CLEAN, core-gen) $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpftool $(Q)$(RM) -r -- $(OUTPUT)feature/ diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 3f1da30c4da6..fdffbc64c65c 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -705,7 +705,7 @@ _bpftool() hash_of_maps devmap devmap_hash sockmap cpumap \ xskmap sockhash cgroup_storage reuseport_sockarray \ percpu_cgroup_storage queue stack sk_storage \ - struct_ops inode_storage' -- \ + struct_ops inode_storage task_storage' -- \ "$cur" ) ) return 0 ;; diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 8ab142ff5eac..fe9e7b3a4b50 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -357,9 +357,15 @@ static int dump_btf_raw(const struct btf *btf, dump_btf_type(btf, root_type_ids[i], t); } } else { + const struct btf *base; int cnt = btf__get_nr_types(btf); + int start_id = 1; - for (i = 1; i <= cnt; i++) { + base = btf__base_btf(btf); + if (base) + start_id = btf__get_nr_types(base) + 1; + + for (i = start_id; i <= cnt; i++) { t = btf__type_by_id(btf, i); dump_btf_type(btf, i, t); } @@ -424,7 +430,7 @@ done: static int do_dump(int argc, char **argv) { - struct btf *btf = NULL; + struct btf *btf = NULL, *base = NULL; __u32 root_type_ids[2]; int root_type_cnt = 0; bool dump_c = false; @@ -438,7 +444,6 @@ static int do_dump(int argc, char **argv) return -1; } src = GET_ARG(); - if (is_prefix(src, "map")) { struct bpf_map_info info = {}; __u32 len = sizeof(info); @@ -499,7 +504,21 @@ static int do_dump(int argc, char **argv) } NEXT_ARG(); } else if (is_prefix(src, "file")) { - btf = btf__parse(*argv, NULL); + const char sysfs_prefix[] = "/sys/kernel/btf/"; + const char sysfs_vmlinux[] = "/sys/kernel/btf/vmlinux"; + + if (!base_btf && + strncmp(*argv, sysfs_prefix, sizeof(sysfs_prefix) - 1) == 0 && + strcmp(*argv, sysfs_vmlinux) != 0) { + base = btf__parse(sysfs_vmlinux, NULL); + if (libbpf_get_error(base)) { + p_err("failed to parse vmlinux BTF at '%s': %ld\n", + sysfs_vmlinux, libbpf_get_error(base)); + base = NULL; + } + } + + btf = btf__parse_split(*argv, base ?: base_btf); if (IS_ERR(btf)) { err = -PTR_ERR(btf); btf = NULL; @@ -564,6 +583,7 @@ static int do_dump(int argc, char **argv) done: close(fd); btf__free(btf); + btf__free(base); return err; } @@ -693,6 +713,7 @@ build_btf_type_table(struct btf_attach_table *tab, enum bpf_obj_type type, obj_node = calloc(1, sizeof(*obj_node)); if (!obj_node) { p_err("failed to allocate memory: %s", strerror(errno)); + err = -ENOMEM; goto err_free; } @@ -739,9 +760,16 @@ show_btf_plain(struct bpf_btf_info *info, int fd, struct btf_attach_table *btf_map_table) { struct btf_attach_point *obj; + const char *name = u64_to_ptr(info->name); int n; printf("%u: ", info->id); + if (info->kernel_btf) + printf("name [%s] ", name); + else if (name && name[0]) + printf("name %s ", name); + else + printf("name <anon> "); printf("size %uB", info->btf_size); n = 0; @@ -768,6 +796,7 @@ show_btf_json(struct bpf_btf_info *info, int fd, struct btf_attach_table *btf_map_table) { struct btf_attach_point *obj; + const char *name = u64_to_ptr(info->name); jsonw_start_object(json_wtr); /* btf object */ jsonw_uint_field(json_wtr, "id", info->id); @@ -793,6 +822,11 @@ show_btf_json(struct bpf_btf_info *info, int fd, emit_obj_refs_json(&refs_table, info->id, json_wtr); /* pids */ + jsonw_bool_field(json_wtr, "kernel", info->kernel_btf); + + if (name && name[0]) + jsonw_string_field(json_wtr, "name", name); + jsonw_end_object(json_wtr); /* btf object */ } @@ -800,15 +834,30 @@ static int show_btf(int fd, struct btf_attach_table *btf_prog_table, struct btf_attach_table *btf_map_table) { - struct bpf_btf_info info = {}; + struct bpf_btf_info info; __u32 len = sizeof(info); + char name[64]; int err; + memset(&info, 0, sizeof(info)); err = bpf_obj_get_info_by_fd(fd, &info, &len); if (err) { p_err("can't get BTF object info: %s", strerror(errno)); return -1; } + /* if kernel support emitting BTF object name, pass name pointer */ + if (info.name_len) { + memset(&info, 0, sizeof(info)); + info.name_len = sizeof(name); + info.name = ptr_to_u64(name); + len = sizeof(info); + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get BTF object info: %s", strerror(errno)); + return -1; + } + } if (json_output) show_btf_json(&info, fd, btf_prog_table, btf_map_table); diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 682daaa49e6a..b86f450e6fce 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -11,6 +11,7 @@ #include <bpf/bpf.h> #include <bpf/libbpf.h> +#include <bpf/btf.h> #include "main.h" @@ -28,6 +29,7 @@ bool show_pinned; bool block_mount; bool verifier_logs; bool relaxed_maps; +struct btf *base_btf; struct pinned_obj_table prog_table; struct pinned_obj_table map_table; struct pinned_obj_table link_table; @@ -391,6 +393,7 @@ int main(int argc, char **argv) { "mapcompat", no_argument, NULL, 'm' }, { "nomount", no_argument, NULL, 'n' }, { "debug", no_argument, NULL, 'd' }, + { "base-btf", required_argument, NULL, 'B' }, { 0 } }; int opt, ret; @@ -407,7 +410,7 @@ int main(int argc, char **argv) hash_init(link_table.table); opterr = 0; - while ((opt = getopt_long(argc, argv, "Vhpjfmnd", + while ((opt = getopt_long(argc, argv, "VhpjfmndB:", options, NULL)) >= 0) { switch (opt) { case 'V': @@ -441,6 +444,15 @@ int main(int argc, char **argv) libbpf_set_print(print_all_levels); verifier_logs = true; break; + case 'B': + base_btf = btf__parse(optarg, NULL); + if (libbpf_get_error(base_btf)) { + p_err("failed to parse base BTF at '%s': %ld\n", + optarg, libbpf_get_error(base_btf)); + base_btf = NULL; + return -1; + } + break; default: p_err("unrecognized option '%s'", argv[optind - 1]); if (json_output) @@ -465,6 +477,7 @@ int main(int argc, char **argv) delete_pinned_obj_table(&map_table); delete_pinned_obj_table(&link_table); } + btf__free(base_btf); return ret; } diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index c46e52137b87..76e91641262b 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -90,6 +90,7 @@ extern bool show_pids; extern bool block_mount; extern bool verifier_logs; extern bool relaxed_maps; +extern struct btf *base_btf; extern struct pinned_obj_table prog_table; extern struct pinned_obj_table map_table; extern struct pinned_obj_table link_table; diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index a7efbd84fbcc..b400364ee054 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -51,6 +51,7 @@ const char * const map_type_name[] = { [BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops", [BPF_MAP_TYPE_RINGBUF] = "ringbuf", [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", + [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", }; const size_t map_type_name_size = ARRAY_SIZE(map_type_name); @@ -1464,7 +1465,8 @@ static int do_help(int argc, char **argv) " lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n" " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" - " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage }\n" + " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" + " task_storage }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/pids.c b/tools/bpf/bpftool/pids.c index df7d8ec76036..477e55d59c34 100644 --- a/tools/bpf/bpftool/pids.c +++ b/tools/bpf/bpftool/pids.c @@ -89,9 +89,9 @@ libbpf_print_none(__maybe_unused enum libbpf_print_level level, int build_obj_refs_table(struct obj_refs_table *table, enum bpf_obj_type type) { - char buf[4096]; - struct pid_iter_bpf *skel; struct pid_iter_entry *e; + char buf[4096 / sizeof(*e) * sizeof(*e)]; + struct pid_iter_bpf *skel; int err, ret, fd = -1, i; libbpf_print_fn_t default_print; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index acdb2c245f0a..1fe3ba255bad 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1717,6 +1717,34 @@ struct profile_metric { .ratio_desc = "LLC misses per million insns", .ratio_mul = 1e6, }, + { + .name = "itlb_misses", + .attr = { + .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_ITLB | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + .exclude_user = 1 + }, + .ratio_metric = 2, + .ratio_desc = "itlb misses per million insns", + .ratio_mul = 1e6, + }, + { + .name = "dtlb_misses", + .attr = { + .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_DTLB | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + .exclude_user = 1 + }, + .ratio_metric = 2, + .ratio_desc = "dtlb misses per million insns", + .ratio_mul = 1e6, + }, }; static __u64 profile_total_count; @@ -2109,7 +2137,7 @@ static int do_help(int argc, char **argv) " struct_ops | fentry | fexit | freplace | sk_lookup }\n" " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" " flow_dissector }\n" - " METRIC := { cycles | instructions | l1d_loads | llc_misses }\n" + " METRIC := { cycles | instructions | l1d_loads | llc_misses | itlb_misses | dtlb_misses }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index 66cb92136de4..bf656432ad73 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -18,15 +18,6 @@ else endif # always use the host compiler -ifneq ($(LLVM),) -HOSTAR ?= llvm-ar -HOSTCC ?= clang -HOSTLD ?= ld.lld -else -HOSTAR ?= ar -HOSTCC ?= gcc -HOSTLD ?= ld -endif AR = $(HOSTAR) CC = $(HOSTCC) LD = $(HOSTLD) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index dfa540d8a02d..e3ea569ee125 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -454,7 +454,7 @@ static int symbols_collect(struct object *obj) return -ENOMEM; if (id->addr_cnt >= ADDR_CNT) { - pr_err("FAILED symbol %s crossed the number of allowed lists", + pr_err("FAILED symbol %s crossed the number of allowed lists\n", id->name); return -1; } @@ -477,8 +477,8 @@ static int symbols_resolve(struct object *obj) btf = btf__parse(obj->btf ?: obj->path, NULL); err = libbpf_get_error(btf); if (err) { - pr_err("FAILED: load BTF from %s: %s", - obj->path, strerror(err)); + pr_err("FAILED: load BTF from %s: %s\n", + obj->path, strerror(-err)); return -1; } diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index fb1337d69868..4d5ca54fcd4c 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -1,13 +1,18 @@ # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) -OUTPUT := .output +include ../../scripts/Makefile.include + +OUTPUT ?= $(abspath .output)/ + CLANG ?= clang LLC ?= llc LLVM_STRIP ?= llvm-strip -DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool +BPFTOOL_OUTPUT := $(OUTPUT)bpftool/ +DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)bpftool BPFTOOL ?= $(DEFAULT_BPFTOOL) LIBBPF_SRC := $(abspath ../../lib/bpf) -BPFOBJ := $(OUTPUT)/libbpf.a -BPF_INCLUDE := $(OUTPUT) +BPFOBJ_OUTPUT := $(OUTPUT)libbpf/ +BPFOBJ := $(BPFOBJ_OUTPUT)libbpf.a +BPF_INCLUDE := $(BPFOBJ_OUTPUT) INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../lib) \ -I$(abspath ../../include/uapi) CFLAGS := -g -Wall @@ -18,13 +23,10 @@ VMLINUX_BTF_PATHS := /sys/kernel/btf/vmlinux /boot/vmlinux-$(KERNEL_REL) VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \ $(wildcard $(VMLINUX_BTF_PATHS)))) -abs_out := $(abspath $(OUTPUT)) ifeq ($(V),1) Q = -msg = else Q = @ -msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))"; MAKEFLAGS += --no-print-directory submake_extras := feature_display=0 endif @@ -37,12 +39,15 @@ all: runqslower runqslower: $(OUTPUT)/runqslower clean: - $(call msg,CLEAN) - $(Q)rm -rf $(OUTPUT) runqslower + $(call QUIET_CLEAN, runqslower) + $(Q)$(RM) -r $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT) + $(Q)$(RM) $(OUTPUT)*.o $(OUTPUT)*.d + $(Q)$(RM) $(OUTPUT)*.skel.h $(OUTPUT)vmlinux.h + $(Q)$(RM) $(OUTPUT)runqslower + $(Q)$(RM) -r .output $(OUTPUT)/runqslower: $(OUTPUT)/runqslower.o $(BPFOBJ) - $(call msg,BINARY,$@) - $(Q)$(CC) $(CFLAGS) $^ -lelf -lz -o $@ + $(QUIET_LINK)$(CC) $(CFLAGS) $^ -lelf -lz -o $@ $(OUTPUT)/runqslower.o: runqslower.h $(OUTPUT)/runqslower.skel.h \ $(OUTPUT)/runqslower.bpf.o @@ -50,36 +55,30 @@ $(OUTPUT)/runqslower.o: runqslower.h $(OUTPUT)/runqslower.skel.h \ $(OUTPUT)/runqslower.bpf.o: $(OUTPUT)/vmlinux.h runqslower.h $(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(BPFTOOL) - $(call msg,GEN-SKEL,$@) - $(Q)$(BPFTOOL) gen skeleton $< > $@ + $(QUIET_GEN)$(BPFTOOL) gen skeleton $< > $@ $(OUTPUT)/%.bpf.o: %.bpf.c $(BPFOBJ) | $(OUTPUT) - $(call msg,BPF,$@) - $(Q)$(CLANG) -g -O2 -target bpf $(INCLUDES) \ + $(QUIET_GEN)$(CLANG) -g -O2 -target bpf $(INCLUDES) \ -c $(filter %.c,$^) -o $@ && \ $(LLVM_STRIP) -g $@ $(OUTPUT)/%.o: %.c | $(OUTPUT) - $(call msg,CC,$@) - $(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ + $(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ -$(OUTPUT): - $(call msg,MKDIR,$@) - $(Q)mkdir -p $(OUTPUT) +$(OUTPUT) $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT): + $(QUIET_MKDIR)mkdir -p $@ $(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL) - $(call msg,GEN,$@) $(Q)if [ ! -e "$(VMLINUX_BTF_PATH)" ] ; then \ echo "Couldn't find kernel BTF; set VMLINUX_BTF to" \ "specify its location." >&2; \ exit 1;\ fi - $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ + $(QUIET_GEN)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ -$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT) - $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \ - OUTPUT=$(abspath $(dir $@))/ $(abspath $@) +$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(BPFOBJ_OUTPUT) $@ -$(DEFAULT_BPFTOOL): - $(Q)$(MAKE) $(submake_extras) -C ../bpftool \ - prefix= OUTPUT=$(abs_out)/ DESTDIR=$(abs_out) install +$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C ../bpftool OUTPUT=$(BPFTOOL_OUTPUT) \ + CC=$(HOSTCC) LD=$(HOSTLD) diff --git a/tools/build/Makefile b/tools/build/Makefile index 722f1700d96a..bae48e6fa995 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -15,10 +15,6 @@ endef $(call allow-override,CC,$(CROSS_COMPILE)gcc) $(call allow-override,LD,$(CROSS_COMPILE)ld) -HOSTCC ?= gcc -HOSTLD ?= ld -HOSTAR ?= ar - export HOSTCC HOSTLD HOSTAR ifeq ($(V),1) diff --git a/tools/debugging/kernel-chktaint b/tools/debugging/kernel-chktaint index 2240cb56e6e5..607b2b280945 100755 --- a/tools/debugging/kernel-chktaint +++ b/tools/debugging/kernel-chktaint @@ -72,7 +72,7 @@ if [ `expr $T % 2` -eq 0 ]; then addout " " else addout "S" - echo " * SMP kernel oops on an officially SMP incapable processor (#2)" + echo " * kernel running on an out of specification system (#2)" fi T=`expr $T / 2` diff --git a/tools/include/linux/poison.h b/tools/include/linux/poison.h index d29725769107..2e6338ac5eed 100644 --- a/tools/include/linux/poison.h +++ b/tools/include/linux/poison.h @@ -35,12 +35,8 @@ */ #define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) -/********** mm/debug-pagealloc.c **********/ -#ifdef CONFIG_PAGE_POISONING_ZERO -#define PAGE_POISON 0x00 -#else +/********** mm/page_poison.c **********/ #define PAGE_POISON 0xaa -#endif /********** mm/page_alloc.c ************/ diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 2551e9b71167..e61d36cd4e50 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -107,7 +107,7 @@ static int errno; #endif /* errno codes all ensure that they will not conflict with a valid pointer - * because they all correspond to the highest addressable memry page. + * because they all correspond to the highest addressable memory page. */ #define MAX_ERRNO 4095 @@ -231,7 +231,7 @@ struct rusage { #define DT_SOCK 12 /* all the *at functions */ -#ifndef AT_FDWCD +#ifndef AT_FDCWD #define AT_FDCWD -100 #endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e6ceac3f7d62..77d7c1bb2923 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -157,6 +157,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, }; /* Note that tracing related programs such as @@ -556,7 +557,12 @@ union bpf_attr { __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ - __u32 attach_prog_fd; /* 0 to attach to vmlinux */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -3742,6 +3748,88 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be an task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. + * + * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) + * Description + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * Return + * **-EINVAL** if invalid *flags* are passed, zero otherwise. + * + * u64 bpf_ktime_get_coarse_ns(void) + * Description + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * Return + * Current *ktime*. + * + * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) + * Description + * Returns the stored IMA hash of the *inode* (if it's avaialable). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. + * + * struct socket *bpf_sock_from_file(struct file *file) + * Description + * If the given file represents a socket, returns the associated + * socket. + * Return + * A pointer to a struct socket on success or NULL if the file is + * not a socket. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3897,9 +3985,16 @@ union bpf_attr { FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ FN(redirect_neigh), \ - FN(bpf_per_cpu_ptr), \ - FN(bpf_this_cpu_ptr), \ + FN(per_cpu_ptr), \ + FN(this_cpu_ptr), \ FN(redirect_peer), \ + FN(task_storage_get), \ + FN(task_storage_delete), \ + FN(get_current_task_btf), \ + FN(bprm_opts_set), \ + FN(ktime_get_coarse_ns), \ + FN(ima_inode_hash), \ + FN(sock_from_file), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4071,6 +4166,11 @@ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_IP, }; +/* Flags for bpf_bprm_opts_set helper */ +enum { + BPF_F_BPRM_SECUREEXEC = (1ULL << 0), +}; + #define __bpf_md_ptr(type, name) \ union { \ type name; \ @@ -4418,6 +4518,9 @@ struct bpf_btf_info { __aligned_u64 btf; __u32 btf_size; __u32 id; + __aligned_u64 name; + __u32 name_len; + __u32 kernel_btf; } __attribute__((aligned(8))); struct bpf_link_info { diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 781e482dc499..d208b2af697f 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -409,6 +409,8 @@ enum { IFLA_MACVLAN_MACADDR, IFLA_MACVLAN_MACADDR_DATA, IFLA_MACVLAN_MACADDR_COUNT, + IFLA_MACVLAN_BC_QUEUE_LEN, + IFLA_MACVLAN_BC_QUEUE_LEN_USED, __IFLA_MACVLAN_MAX, }; diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index d199a3694be8..b0bf56c5f120 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -742,7 +742,11 @@ class DebugfsProvider(Provider): The fields are all available KVM debugfs files """ - return self.walkdir(PATH_DEBUGFS_KVM)[2] + exempt_list = ['halt_poll_fail_ns', 'halt_poll_success_ns'] + fields = [field for field in self.walkdir(PATH_DEBUGFS_KVM)[2] + if field not in exempt_list] + + return fields def update_fields(self, fields_filter): """Refresh fields, applying fields_filter""" diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index d27e34133973..bba48ff4c5c0 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -67,11 +67,12 @@ static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size) { + int retries = 5; int fd; do { fd = sys_bpf(BPF_PROG_LOAD, attr, size); - } while (fd < 0 && errno == EAGAIN); + } while (fd < 0 && errno == EAGAIN && retries-- > 0); return fd; } @@ -214,59 +215,55 @@ alloc_zero_tailing_info(const void *orecord, __u32 cnt, return info; } -int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, - char *log_buf, size_t log_buf_sz) +int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr) { void *finfo = NULL, *linfo = NULL; union bpf_attr attr; - __u32 log_level; int fd; - if (!load_attr || !log_buf != !log_buf_sz) + if (!load_attr->log_buf != !load_attr->log_buf_sz) return -EINVAL; - log_level = load_attr->log_level; - if (log_level > (4 | 2 | 1) || (log_level && !log_buf)) + if (load_attr->log_level > (4 | 2 | 1) || (load_attr->log_level && !load_attr->log_buf)) return -EINVAL; memset(&attr, 0, sizeof(attr)); attr.prog_type = load_attr->prog_type; attr.expected_attach_type = load_attr->expected_attach_type; - if (attr.prog_type == BPF_PROG_TYPE_STRUCT_OPS || - attr.prog_type == BPF_PROG_TYPE_LSM) { - attr.attach_btf_id = load_attr->attach_btf_id; - } else if (attr.prog_type == BPF_PROG_TYPE_TRACING || - attr.prog_type == BPF_PROG_TYPE_EXT) { - attr.attach_btf_id = load_attr->attach_btf_id; + + if (load_attr->attach_prog_fd) attr.attach_prog_fd = load_attr->attach_prog_fd; - } else { - attr.prog_ifindex = load_attr->prog_ifindex; - attr.kern_version = load_attr->kern_version; - } - attr.insn_cnt = (__u32)load_attr->insns_cnt; + else + attr.attach_btf_obj_fd = load_attr->attach_btf_obj_fd; + attr.attach_btf_id = load_attr->attach_btf_id; + + attr.prog_ifindex = load_attr->prog_ifindex; + attr.kern_version = load_attr->kern_version; + + attr.insn_cnt = (__u32)load_attr->insn_cnt; attr.insns = ptr_to_u64(load_attr->insns); attr.license = ptr_to_u64(load_attr->license); - attr.log_level = log_level; - if (log_level) { - attr.log_buf = ptr_to_u64(log_buf); - attr.log_size = log_buf_sz; - } else { - attr.log_buf = ptr_to_u64(NULL); - attr.log_size = 0; + attr.log_level = load_attr->log_level; + if (attr.log_level) { + attr.log_buf = ptr_to_u64(load_attr->log_buf); + attr.log_size = load_attr->log_buf_sz; } attr.prog_btf_fd = load_attr->prog_btf_fd; + attr.prog_flags = load_attr->prog_flags; + attr.func_info_rec_size = load_attr->func_info_rec_size; attr.func_info_cnt = load_attr->func_info_cnt; attr.func_info = ptr_to_u64(load_attr->func_info); + attr.line_info_rec_size = load_attr->line_info_rec_size; attr.line_info_cnt = load_attr->line_info_cnt; attr.line_info = ptr_to_u64(load_attr->line_info); + if (load_attr->name) memcpy(attr.prog_name, load_attr->name, - min(strlen(load_attr->name), BPF_OBJ_NAME_LEN - 1)); - attr.prog_flags = load_attr->prog_flags; + min(strlen(load_attr->name), (size_t)BPF_OBJ_NAME_LEN - 1)); fd = sys_bpf_prog_load(&attr, sizeof(attr)); if (fd >= 0) @@ -306,19 +303,19 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, } fd = sys_bpf_prog_load(&attr, sizeof(attr)); - if (fd >= 0) goto done; } - if (log_level || !log_buf) + if (load_attr->log_level || !load_attr->log_buf) goto done; /* Try again with log */ - attr.log_buf = ptr_to_u64(log_buf); - attr.log_size = log_buf_sz; + attr.log_buf = ptr_to_u64(load_attr->log_buf); + attr.log_size = load_attr->log_buf_sz; attr.log_level = 1; - log_buf[0] = 0; + load_attr->log_buf[0] = 0; + fd = sys_bpf_prog_load(&attr, sizeof(attr)); done: free(finfo); @@ -326,6 +323,49 @@ done: return fd; } +int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, + char *log_buf, size_t log_buf_sz) +{ + struct bpf_prog_load_params p = {}; + + if (!load_attr || !log_buf != !log_buf_sz) + return -EINVAL; + + p.prog_type = load_attr->prog_type; + p.expected_attach_type = load_attr->expected_attach_type; + switch (p.prog_type) { + case BPF_PROG_TYPE_STRUCT_OPS: + case BPF_PROG_TYPE_LSM: + p.attach_btf_id = load_attr->attach_btf_id; + break; + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_EXT: + p.attach_btf_id = load_attr->attach_btf_id; + p.attach_prog_fd = load_attr->attach_prog_fd; + break; + default: + p.prog_ifindex = load_attr->prog_ifindex; + p.kern_version = load_attr->kern_version; + } + p.insn_cnt = load_attr->insns_cnt; + p.insns = load_attr->insns; + p.license = load_attr->license; + p.log_level = load_attr->log_level; + p.log_buf = log_buf; + p.log_buf_sz = log_buf_sz; + p.prog_btf_fd = load_attr->prog_btf_fd; + p.func_info_rec_size = load_attr->func_info_rec_size; + p.func_info_cnt = load_attr->func_info_cnt; + p.func_info = load_attr->func_info; + p.line_info_rec_size = load_attr->line_info_rec_size; + p.line_info_cnt = load_attr->line_info_cnt; + p.line_info = load_attr->line_info; + p.name = load_attr->name; + p.prog_flags = load_attr->prog_flags; + + return libbpf__bpf_prog_load(&p); +} + int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, size_t insns_cnt, const char *license, __u32 kern_version, char *log_buf, diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 231b07203e3d..3c3f2bc6c652 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -78,10 +78,32 @@ struct btf { void *types_data; size_t types_data_cap; /* used size stored in hdr->type_len */ - /* type ID to `struct btf_type *` lookup index */ + /* type ID to `struct btf_type *` lookup index + * type_offs[0] corresponds to the first non-VOID type: + * - for base BTF it's type [1]; + * - for split BTF it's the first non-base BTF type. + */ __u32 *type_offs; size_t type_offs_cap; + /* number of types in this BTF instance: + * - doesn't include special [0] void type; + * - for split BTF counts number of types added on top of base BTF. + */ __u32 nr_types; + /* if not NULL, points to the base BTF on top of which the current + * split BTF is based + */ + struct btf *base_btf; + /* BTF type ID of the first type in this BTF instance: + * - for base BTF it's equal to 1; + * - for split BTF it's equal to biggest type ID of base BTF plus 1. + */ + int start_id; + /* logical string offset of this BTF instance: + * - for base BTF it's equal to 0; + * - for split BTF it's equal to total size of base BTF's string section size. + */ + int start_str_off; void *strs_data; size_t strs_data_cap; /* used size stored in hdr->str_len */ @@ -90,6 +112,14 @@ struct btf { struct hashmap *strs_hash; /* whether strings are already deduplicated */ bool strs_deduped; + /* extra indirection layer to make strings hashmap work with stable + * string offsets and ability to transparently choose between + * btf->strs_data or btf_dedup->strs_data as a source of strings. + * This is used for BTF strings dedup to transfer deduplicated strings + * data back to struct btf without re-building strings index. + */ + void **strs_data_ptr; + /* BTF object FD, if loaded into kernel */ int fd; @@ -168,7 +198,7 @@ static int btf_add_type_idx_entry(struct btf *btf, __u32 type_off) __u32 *p; p = btf_add_mem((void **)&btf->type_offs, &btf->type_offs_cap, sizeof(__u32), - btf->nr_types + 1, BTF_MAX_NR_TYPES, 1); + btf->nr_types, BTF_MAX_NR_TYPES, 1); if (!p) return -ENOMEM; @@ -215,22 +245,18 @@ static int btf_parse_hdr(struct btf *btf) return -EINVAL; } - if (meta_left < hdr->type_off) { - pr_debug("Invalid BTF type section offset:%u\n", hdr->type_off); + if (meta_left < hdr->str_off + hdr->str_len) { + pr_debug("Invalid BTF total size:%u\n", btf->raw_size); return -EINVAL; } - if (meta_left < hdr->str_off) { - pr_debug("Invalid BTF string section offset:%u\n", hdr->str_off); + if (hdr->type_off + hdr->type_len > hdr->str_off) { + pr_debug("Invalid BTF data sections layout: type data at %u + %u, strings data at %u + %u\n", + hdr->type_off, hdr->type_len, hdr->str_off, hdr->str_len); return -EINVAL; } - if (hdr->type_off >= hdr->str_off) { - pr_debug("BTF type section offset >= string section offset. No type?\n"); - return -EINVAL; - } - - if (hdr->type_off & 0x02) { + if (hdr->type_off % 4) { pr_debug("BTF type section is not aligned to 4 bytes\n"); return -EINVAL; } @@ -244,12 +270,16 @@ static int btf_parse_str_sec(struct btf *btf) const char *start = btf->strs_data; const char *end = start + btf->hdr->str_len; - if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_STR_OFFSET || - start[0] || end[-1]) { + if (btf->base_btf && hdr->str_len == 0) + return 0; + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_STR_OFFSET || end[-1]) { + pr_debug("Invalid BTF string section\n"); + return -EINVAL; + } + if (!btf->base_btf && start[0]) { pr_debug("Invalid BTF string section\n"); return -EINVAL; } - return 0; } @@ -364,19 +394,9 @@ static int btf_parse_type_sec(struct btf *btf) struct btf_header *hdr = btf->hdr; void *next_type = btf->types_data; void *end_type = next_type + hdr->type_len; - int err, i = 0, type_size; - - /* VOID (type_id == 0) is specially handled by btf__get_type_by_id(), - * so ensure we can never properly use its offset from index by - * setting it to a large value - */ - err = btf_add_type_idx_entry(btf, UINT_MAX); - if (err) - return err; + int err, type_size; while (next_type + sizeof(struct btf_type) <= end_type) { - i++; - if (btf->swapped_endian) btf_bswap_type_base(next_type); @@ -384,7 +404,7 @@ static int btf_parse_type_sec(struct btf *btf) if (type_size < 0) return type_size; if (next_type + type_size > end_type) { - pr_warn("BTF type [%d] is malformed\n", i); + pr_warn("BTF type [%d] is malformed\n", btf->start_id + btf->nr_types); return -EINVAL; } @@ -409,7 +429,12 @@ static int btf_parse_type_sec(struct btf *btf) __u32 btf__get_nr_types(const struct btf *btf) { - return btf->nr_types; + return btf->start_id + btf->nr_types - 1; +} + +const struct btf *btf__base_btf(const struct btf *btf) +{ + return btf->base_btf; } /* internal helper returning non-const pointer to a type */ @@ -417,13 +442,14 @@ static struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id) { if (type_id == 0) return &btf_void; - - return btf->types_data + btf->type_offs[type_id]; + if (type_id < btf->start_id) + return btf_type_by_id(btf->base_btf, type_id); + return btf->types_data + btf->type_offs[type_id - btf->start_id]; } const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id) { - if (type_id > btf->nr_types) + if (type_id >= btf->start_id + btf->nr_types) return NULL; return btf_type_by_id((struct btf *)btf, type_id); } @@ -432,9 +458,13 @@ static int determine_ptr_size(const struct btf *btf) { const struct btf_type *t; const char *name; - int i; + int i, n; - for (i = 1; i <= btf->nr_types; i++) { + if (btf->base_btf && btf->base_btf->ptr_sz > 0) + return btf->base_btf->ptr_sz; + + n = btf__get_nr_types(btf); + for (i = 1; i <= n; i++) { t = btf__type_by_id(btf, i); if (!btf_is_int(t)) continue; @@ -649,12 +679,12 @@ int btf__resolve_type(const struct btf *btf, __u32 type_id) __s32 btf__find_by_name(const struct btf *btf, const char *type_name) { - __u32 i; + __u32 i, nr_types = btf__get_nr_types(btf); if (!strcmp(type_name, "void")) return 0; - for (i = 1; i <= btf->nr_types; i++) { + for (i = 1; i <= nr_types; i++) { const struct btf_type *t = btf__type_by_id(btf, i); const char *name = btf__name_by_offset(btf, t->name_off); @@ -668,12 +698,12 @@ __s32 btf__find_by_name(const struct btf *btf, const char *type_name) __s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name, __u32 kind) { - __u32 i; + __u32 i, nr_types = btf__get_nr_types(btf); if (kind == BTF_KIND_UNKN || !strcmp(type_name, "void")) return 0; - for (i = 1; i <= btf->nr_types; i++) { + for (i = 1; i <= nr_types; i++) { const struct btf_type *t = btf__type_by_id(btf, i); const char *name; @@ -717,7 +747,7 @@ void btf__free(struct btf *btf) free(btf); } -struct btf *btf__new_empty(void) +static struct btf *btf_new_empty(struct btf *base_btf) { struct btf *btf; @@ -725,12 +755,21 @@ struct btf *btf__new_empty(void) if (!btf) return ERR_PTR(-ENOMEM); + btf->nr_types = 0; + btf->start_id = 1; + btf->start_str_off = 0; btf->fd = -1; btf->ptr_sz = sizeof(void *); btf->swapped_endian = false; + if (base_btf) { + btf->base_btf = base_btf; + btf->start_id = btf__get_nr_types(base_btf) + 1; + btf->start_str_off = base_btf->hdr->str_len; + } + /* +1 for empty string at offset 0 */ - btf->raw_size = sizeof(struct btf_header) + 1; + btf->raw_size = sizeof(struct btf_header) + (base_btf ? 0 : 1); btf->raw_data = calloc(1, btf->raw_size); if (!btf->raw_data) { free(btf); @@ -744,12 +783,22 @@ struct btf *btf__new_empty(void) btf->types_data = btf->raw_data + btf->hdr->hdr_len; btf->strs_data = btf->raw_data + btf->hdr->hdr_len; - btf->hdr->str_len = 1; /* empty string at offset 0 */ + btf->hdr->str_len = base_btf ? 0 : 1; /* empty string at offset 0 */ return btf; } -struct btf *btf__new(const void *data, __u32 size) +struct btf *btf__new_empty(void) +{ + return btf_new_empty(NULL); +} + +struct btf *btf__new_empty_split(struct btf *base_btf) +{ + return btf_new_empty(base_btf); +} + +static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) { struct btf *btf; int err; @@ -758,6 +807,16 @@ struct btf *btf__new(const void *data, __u32 size) if (!btf) return ERR_PTR(-ENOMEM); + btf->nr_types = 0; + btf->start_id = 1; + btf->start_str_off = 0; + + if (base_btf) { + btf->base_btf = base_btf; + btf->start_id = btf__get_nr_types(base_btf) + 1; + btf->start_str_off = base_btf->hdr->str_len; + } + btf->raw_data = malloc(size); if (!btf->raw_data) { err = -ENOMEM; @@ -790,7 +849,13 @@ done: return btf; } -struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext) +struct btf *btf__new(const void *data, __u32 size) +{ + return btf_new(data, size, NULL); +} + +static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, + struct btf_ext **btf_ext) { Elf_Data *btf_data = NULL, *btf_ext_data = NULL; int err = 0, fd = -1, idx = 0; @@ -868,7 +933,7 @@ struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext) err = -ENOENT; goto done; } - btf = btf__new(btf_data->d_buf, btf_data->d_size); + btf = btf_new(btf_data->d_buf, btf_data->d_size, base_btf); if (IS_ERR(btf)) goto done; @@ -913,7 +978,17 @@ done: return btf; } -struct btf *btf__parse_raw(const char *path) +struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext) +{ + return btf_parse_elf(path, NULL, btf_ext); +} + +struct btf *btf__parse_elf_split(const char *path, struct btf *base_btf) +{ + return btf_parse_elf(path, base_btf, NULL); +} + +static struct btf *btf_parse_raw(const char *path, struct btf *base_btf) { struct btf *btf = NULL; void *data = NULL; @@ -967,7 +1042,7 @@ struct btf *btf__parse_raw(const char *path) } /* finally parse BTF data */ - btf = btf__new(data, sz); + btf = btf_new(data, sz, base_btf); err_out: free(data); @@ -976,18 +1051,38 @@ err_out: return err ? ERR_PTR(err) : btf; } -struct btf *btf__parse(const char *path, struct btf_ext **btf_ext) +struct btf *btf__parse_raw(const char *path) +{ + return btf_parse_raw(path, NULL); +} + +struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf) +{ + return btf_parse_raw(path, base_btf); +} + +static struct btf *btf_parse(const char *path, struct btf *base_btf, struct btf_ext **btf_ext) { struct btf *btf; if (btf_ext) *btf_ext = NULL; - btf = btf__parse_raw(path); + btf = btf_parse_raw(path, base_btf); if (!IS_ERR(btf) || PTR_ERR(btf) != -EPROTO) return btf; - return btf__parse_elf(path, btf_ext); + return btf_parse_elf(path, base_btf, btf_ext); +} + +struct btf *btf__parse(const char *path, struct btf_ext **btf_ext) +{ + return btf_parse(path, NULL, btf_ext); +} + +struct btf *btf__parse_split(const char *path, struct btf *base_btf) +{ + return btf_parse(path, base_btf, NULL); } static int compare_vsi_off(const void *_a, const void *_b) @@ -1171,8 +1266,8 @@ static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endi memcpy(p, btf->types_data, hdr->type_len); if (swap_endian) { - for (i = 1; i <= btf->nr_types; i++) { - t = p + btf->type_offs[i]; + for (i = 0; i < btf->nr_types; i++) { + t = p + btf->type_offs[i]; /* btf_bswap_type_rest() relies on native t->info, so * we swap base type info after we swapped all the * additional information @@ -1215,8 +1310,10 @@ const void *btf__get_raw_data(const struct btf *btf_ro, __u32 *size) const char *btf__str_by_offset(const struct btf *btf, __u32 offset) { - if (offset < btf->hdr->str_len) - return btf->strs_data + offset; + if (offset < btf->start_str_off) + return btf__str_by_offset(btf->base_btf, offset); + else if (offset - btf->start_str_off < btf->hdr->str_len) + return btf->strs_data + (offset - btf->start_str_off); else return NULL; } @@ -1226,35 +1323,27 @@ const char *btf__name_by_offset(const struct btf *btf, __u32 offset) return btf__str_by_offset(btf, offset); } -int btf__get_from_id(__u32 id, struct btf **btf) +struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) { - struct bpf_btf_info btf_info = { 0 }; + struct bpf_btf_info btf_info; __u32 len = sizeof(btf_info); __u32 last_size; - int btf_fd; + struct btf *btf; void *ptr; int err; - err = 0; - *btf = NULL; - btf_fd = bpf_btf_get_fd_by_id(id); - if (btf_fd < 0) - return 0; - /* we won't know btf_size until we call bpf_obj_get_info_by_fd(). so * let's start with a sane default - 4KiB here - and resize it only if * bpf_obj_get_info_by_fd() needs a bigger buffer. */ - btf_info.btf_size = 4096; - last_size = btf_info.btf_size; + last_size = 4096; ptr = malloc(last_size); - if (!ptr) { - err = -ENOMEM; - goto exit_free; - } + if (!ptr) + return ERR_PTR(-ENOMEM); - memset(ptr, 0, last_size); + memset(&btf_info, 0, sizeof(btf_info)); btf_info.btf = ptr_to_u64(ptr); + btf_info.btf_size = last_size; err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); if (!err && btf_info.btf_size > last_size) { @@ -1263,31 +1352,48 @@ int btf__get_from_id(__u32 id, struct btf **btf) last_size = btf_info.btf_size; temp_ptr = realloc(ptr, last_size); if (!temp_ptr) { - err = -ENOMEM; + btf = ERR_PTR(-ENOMEM); goto exit_free; } ptr = temp_ptr; - memset(ptr, 0, last_size); + + len = sizeof(btf_info); + memset(&btf_info, 0, sizeof(btf_info)); btf_info.btf = ptr_to_u64(ptr); + btf_info.btf_size = last_size; + err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); } if (err || btf_info.btf_size > last_size) { - err = errno; + btf = err ? ERR_PTR(-errno) : ERR_PTR(-E2BIG); goto exit_free; } - *btf = btf__new((__u8 *)(long)btf_info.btf, btf_info.btf_size); - if (IS_ERR(*btf)) { - err = PTR_ERR(*btf); - *btf = NULL; - } + btf = btf_new(ptr, btf_info.btf_size, base_btf); exit_free: - close(btf_fd); free(ptr); + return btf; +} - return err; +int btf__get_from_id(__u32 id, struct btf **btf) +{ + struct btf *res; + int btf_fd; + + *btf = NULL; + btf_fd = bpf_btf_get_fd_by_id(id); + if (btf_fd < 0) + return -errno; + + res = btf_get_from_fd(btf_fd, NULL); + close(btf_fd); + if (IS_ERR(res)) + return PTR_ERR(res); + + *btf = res; + return 0; } int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, @@ -1363,17 +1469,19 @@ int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, static size_t strs_hash_fn(const void *key, void *ctx) { - struct btf *btf = ctx; - const char *str = btf->strs_data + (long)key; + const struct btf *btf = ctx; + const char *strs = *btf->strs_data_ptr; + const char *str = strs + (long)key; return str_hash(str); } static bool strs_hash_equal_fn(const void *key1, const void *key2, void *ctx) { - struct btf *btf = ctx; - const char *str1 = btf->strs_data + (long)key1; - const char *str2 = btf->strs_data + (long)key2; + const struct btf *btf = ctx; + const char *strs = *btf->strs_data_ptr; + const char *str1 = strs + (long)key1; + const char *str2 = strs + (long)key2; return strcmp(str1, str2) == 0; } @@ -1418,6 +1526,9 @@ static int btf_ensure_modifiable(struct btf *btf) memcpy(types, btf->types_data, btf->hdr->type_len); memcpy(strs, btf->strs_data, btf->hdr->str_len); + /* make hashmap below use btf->strs_data as a source of strings */ + btf->strs_data_ptr = &btf->strs_data; + /* build lookup index for all strings */ hash = hashmap__new(strs_hash_fn, strs_hash_equal_fn, btf); if (IS_ERR(hash)) { @@ -1448,7 +1559,10 @@ static int btf_ensure_modifiable(struct btf *btf) /* if BTF was created from scratch, all strings are guaranteed to be * unique and deduplicated */ - btf->strs_deduped = btf->hdr->str_len <= 1; + if (btf->hdr->str_len == 0) + btf->strs_deduped = true; + if (!btf->base_btf && btf->hdr->str_len == 1) + btf->strs_deduped = true; /* invalidate raw_data representation */ btf_invalidate_raw_data(btf); @@ -1480,6 +1594,14 @@ int btf__find_str(struct btf *btf, const char *s) long old_off, new_off, len; void *p; + if (btf->base_btf) { + int ret; + + ret = btf__find_str(btf->base_btf, s); + if (ret != -ENOENT) + return ret; + } + /* BTF needs to be in a modifiable state to build string lookup index */ if (btf_ensure_modifiable(btf)) return -ENOMEM; @@ -1494,7 +1616,7 @@ int btf__find_str(struct btf *btf, const char *s) memcpy(p, s, len); if (hashmap__find(btf->strs_hash, (void *)new_off, (void **)&old_off)) - return old_off; + return btf->start_str_off + old_off; return -ENOENT; } @@ -1510,6 +1632,14 @@ int btf__add_str(struct btf *btf, const char *s) void *p; int err; + if (btf->base_btf) { + int ret; + + ret = btf__find_str(btf->base_btf, s); + if (ret != -ENOENT) + return ret; + } + if (btf_ensure_modifiable(btf)) return -ENOMEM; @@ -1536,12 +1666,12 @@ int btf__add_str(struct btf *btf, const char *s) err = hashmap__insert(btf->strs_hash, (void *)new_off, (void *)new_off, HASHMAP_ADD, (const void **)&old_off, NULL); if (err == -EEXIST) - return old_off; /* duplicated string, return existing offset */ + return btf->start_str_off + old_off; /* duplicated string, return existing offset */ if (err) return err; btf->hdr->str_len += len; /* new unique string, adjust data length */ - return new_off; + return btf->start_str_off + new_off; } static void *btf_add_type_mem(struct btf *btf, size_t add_sz) @@ -1560,6 +1690,20 @@ static void btf_type_inc_vlen(struct btf_type *t) t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, btf_kflag(t)); } +static int btf_commit_type(struct btf *btf, int data_sz) +{ + int err; + + err = btf_add_type_idx_entry(btf, btf->hdr->type_len); + if (err) + return err; + + btf->hdr->type_len += data_sz; + btf->hdr->str_off += data_sz; + btf->nr_types++; + return btf->start_id + btf->nr_types - 1; +} + /* * Append new BTF_KIND_INT type with: * - *name* - non-empty, non-NULL type name; @@ -1572,7 +1716,7 @@ static void btf_type_inc_vlen(struct btf_type *t) int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding) { struct btf_type *t; - int sz, err, name_off; + int sz, name_off; /* non-empty name */ if (!name || !name[0]) @@ -1606,14 +1750,7 @@ int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding /* set INT info, we don't allow setting legacy bit offset/size */ *(__u32 *)(t + 1) = (encoding << 24) | (byte_sz * 8); - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* it's completely legal to append BTF types with type IDs pointing forward to @@ -1631,7 +1768,7 @@ static int validate_type_id(int id) static int btf_add_ref_kind(struct btf *btf, int kind, const char *name, int ref_type_id) { struct btf_type *t; - int sz, name_off = 0, err; + int sz, name_off = 0; if (validate_type_id(ref_type_id)) return -EINVAL; @@ -1654,14 +1791,7 @@ static int btf_add_ref_kind(struct btf *btf, int kind, const char *name, int ref t->info = btf_type_info(kind, 0, 0); t->type = ref_type_id; - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* @@ -1689,7 +1819,7 @@ int btf__add_array(struct btf *btf, int index_type_id, int elem_type_id, __u32 n { struct btf_type *t; struct btf_array *a; - int sz, err; + int sz; if (validate_type_id(index_type_id) || validate_type_id(elem_type_id)) return -EINVAL; @@ -1711,21 +1841,14 @@ int btf__add_array(struct btf *btf, int index_type_id, int elem_type_id, __u32 n a->index_type = index_type_id; a->nelems = nr_elems; - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* generic STRUCT/UNION append function */ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 bytes_sz) { struct btf_type *t; - int sz, err, name_off = 0; + int sz, name_off = 0; if (btf_ensure_modifiable(btf)) return -ENOMEM; @@ -1748,14 +1871,7 @@ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 t->info = btf_type_info(kind, 0, 0); t->size = bytes_sz; - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* @@ -1793,6 +1909,11 @@ int btf__add_union(struct btf *btf, const char *name, __u32 byte_sz) return btf_add_composite(btf, BTF_KIND_UNION, name, byte_sz); } +static struct btf_type *btf_last_type(struct btf *btf) +{ + return btf_type_by_id(btf, btf__get_nr_types(btf)); +} + /* * Append new field for the current STRUCT/UNION type with: * - *name* - name of the field, can be NULL or empty for anonymous field; @@ -1814,7 +1935,7 @@ int btf__add_field(struct btf *btf, const char *name, int type_id, /* last type should be union/struct */ if (btf->nr_types == 0) return -EINVAL; - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); if (!btf_is_composite(t)) return -EINVAL; @@ -1849,7 +1970,7 @@ int btf__add_field(struct btf *btf, const char *name, int type_id, m->offset = bit_offset | (bit_size << 24); /* btf_add_type_mem can invalidate t pointer */ - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); /* update parent type's vlen and kflag */ t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, is_bitfield || btf_kflag(t)); @@ -1874,7 +1995,7 @@ int btf__add_field(struct btf *btf, const char *name, int type_id, int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) { struct btf_type *t; - int sz, err, name_off = 0; + int sz, name_off = 0; /* byte_sz must be power of 2 */ if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 8) @@ -1899,14 +2020,7 @@ int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) t->info = btf_type_info(BTF_KIND_ENUM, 0, 0); t->size = byte_sz; - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* @@ -1926,7 +2040,7 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value) /* last type should be BTF_KIND_ENUM */ if (btf->nr_types == 0) return -EINVAL; - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); if (!btf_is_enum(t)) return -EINVAL; @@ -1953,7 +2067,7 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value) v->val = value; /* update parent type's vlen */ - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); btf_type_inc_vlen(t); btf->hdr->type_len += sz; @@ -2093,7 +2207,7 @@ int btf__add_func(struct btf *btf, const char *name, int btf__add_func_proto(struct btf *btf, int ret_type_id) { struct btf_type *t; - int sz, err; + int sz; if (validate_type_id(ret_type_id)) return -EINVAL; @@ -2113,14 +2227,7 @@ int btf__add_func_proto(struct btf *btf, int ret_type_id) t->info = btf_type_info(BTF_KIND_FUNC_PROTO, 0, 0); t->type = ret_type_id; - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* @@ -2143,7 +2250,7 @@ int btf__add_func_param(struct btf *btf, const char *name, int type_id) /* last type should be BTF_KIND_FUNC_PROTO */ if (btf->nr_types == 0) return -EINVAL; - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); if (!btf_is_func_proto(t)) return -EINVAL; @@ -2166,7 +2273,7 @@ int btf__add_func_param(struct btf *btf, const char *name, int type_id) p->type = type_id; /* update parent type's vlen */ - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); btf_type_inc_vlen(t); btf->hdr->type_len += sz; @@ -2188,7 +2295,7 @@ int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id) { struct btf_type *t; struct btf_var *v; - int sz, err, name_off; + int sz, name_off; /* non-empty name */ if (!name || !name[0]) @@ -2219,14 +2326,7 @@ int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id) v = btf_var(t); v->linkage = linkage; - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* @@ -2244,7 +2344,7 @@ int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id) int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz) { struct btf_type *t; - int sz, err, name_off; + int sz, name_off; /* non-empty name */ if (!name || !name[0]) @@ -2267,14 +2367,7 @@ int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz) t->info = btf_type_info(BTF_KIND_DATASEC, 0, 0); t->size = byte_sz; - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* @@ -2296,7 +2389,7 @@ int btf__add_datasec_var_info(struct btf *btf, int var_type_id, __u32 offset, __ /* last type should be BTF_KIND_DATASEC */ if (btf->nr_types == 0) return -EINVAL; - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); if (!btf_is_datasec(t)) return -EINVAL; @@ -2317,7 +2410,7 @@ int btf__add_datasec_var_info(struct btf *btf, int var_type_id, __u32 offset, __ v->size = byte_sz; /* update parent type's vlen */ - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); btf_type_inc_vlen(t); btf->hdr->type_len += sz; @@ -2639,6 +2732,7 @@ struct btf_dedup; static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, const struct btf_dedup_opts *opts); static void btf_dedup_free(struct btf_dedup *d); +static int btf_dedup_prep(struct btf_dedup *d); static int btf_dedup_strings(struct btf_dedup *d); static int btf_dedup_prim_types(struct btf_dedup *d); static int btf_dedup_struct_types(struct btf_dedup *d); @@ -2797,6 +2891,11 @@ int btf__dedup(struct btf *btf, struct btf_ext *btf_ext, if (btf_ensure_modifiable(btf)) return -ENOMEM; + err = btf_dedup_prep(d); + if (err) { + pr_debug("btf_dedup_prep failed:%d\n", err); + goto done; + } err = btf_dedup_strings(d); if (err < 0) { pr_debug("btf_dedup_strings failed:%d\n", err); @@ -2859,21 +2958,20 @@ struct btf_dedup { __u32 *hypot_list; size_t hypot_cnt; size_t hypot_cap; + /* Whether hypothetical mapping, if successful, would need to adjust + * already canonicalized types (due to a new forward declaration to + * concrete type resolution). In such case, during split BTF dedup + * candidate type would still be considered as different, because base + * BTF is considered to be immutable. + */ + bool hypot_adjust_canon; /* Various option modifying behavior of algorithm */ struct btf_dedup_opts opts; -}; - -struct btf_str_ptr { - const char *str; - __u32 new_off; - bool used; -}; - -struct btf_str_ptrs { - struct btf_str_ptr *ptrs; - const char *data; - __u32 cnt; - __u32 cap; + /* temporary strings deduplication state */ + void *strs_data; + size_t strs_cap; + size_t strs_len; + struct hashmap* strs_hash; }; static long hash_combine(long h, long value) @@ -2914,6 +3012,7 @@ static void btf_dedup_clear_hypot_map(struct btf_dedup *d) for (i = 0; i < d->hypot_cnt; i++) d->hypot_map[d->hypot_list[i]] = BTF_UNPROCESSED_ID; d->hypot_cnt = 0; + d->hypot_adjust_canon = false; } static void btf_dedup_free(struct btf_dedup *d) @@ -2953,7 +3052,7 @@ static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, { struct btf_dedup *d = calloc(1, sizeof(struct btf_dedup)); hashmap_hash_fn hash_fn = btf_dedup_identity_hash_fn; - int i, err = 0; + int i, err = 0, type_cnt; if (!d) return ERR_PTR(-ENOMEM); @@ -2973,14 +3072,15 @@ static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, goto done; } - d->map = malloc(sizeof(__u32) * (1 + btf->nr_types)); + type_cnt = btf__get_nr_types(btf) + 1; + d->map = malloc(sizeof(__u32) * type_cnt); if (!d->map) { err = -ENOMEM; goto done; } /* special BTF "void" type is made canonical immediately */ d->map[0] = 0; - for (i = 1; i <= btf->nr_types; i++) { + for (i = 1; i < type_cnt; i++) { struct btf_type *t = btf_type_by_id(d->btf, i); /* VAR and DATASEC are never deduped and are self-canonical */ @@ -2990,12 +3090,12 @@ static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, d->map[i] = BTF_UNPROCESSED_ID; } - d->hypot_map = malloc(sizeof(__u32) * (1 + btf->nr_types)); + d->hypot_map = malloc(sizeof(__u32) * type_cnt); if (!d->hypot_map) { err = -ENOMEM; goto done; } - for (i = 0; i <= btf->nr_types; i++) + for (i = 0; i < type_cnt; i++) d->hypot_map[i] = BTF_UNPROCESSED_ID; done: @@ -3019,8 +3119,8 @@ static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx) int i, j, r, rec_size; struct btf_type *t; - for (i = 1; i <= d->btf->nr_types; i++) { - t = btf_type_by_id(d->btf, i); + for (i = 0; i < d->btf->nr_types; i++) { + t = btf_type_by_id(d->btf, d->btf->start_id + i); r = fn(&t->name_off, ctx); if (r) return r; @@ -3100,64 +3200,53 @@ static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx) return 0; } -static int str_sort_by_content(const void *a1, const void *a2) +static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) { - const struct btf_str_ptr *p1 = a1; - const struct btf_str_ptr *p2 = a2; - - return strcmp(p1->str, p2->str); -} - -static int str_sort_by_offset(const void *a1, const void *a2) -{ - const struct btf_str_ptr *p1 = a1; - const struct btf_str_ptr *p2 = a2; - - if (p1->str != p2->str) - return p1->str < p2->str ? -1 : 1; - return 0; -} - -static int btf_dedup_str_ptr_cmp(const void *str_ptr, const void *pelem) -{ - const struct btf_str_ptr *p = pelem; - - if (str_ptr != p->str) - return (const char *)str_ptr < p->str ? -1 : 1; - return 0; -} - -static int btf_str_mark_as_used(__u32 *str_off_ptr, void *ctx) -{ - struct btf_str_ptrs *strs; - struct btf_str_ptr *s; + struct btf_dedup *d = ctx; + __u32 str_off = *str_off_ptr; + long old_off, new_off, len; + const char *s; + void *p; + int err; - if (*str_off_ptr == 0) + /* don't touch empty string or string in main BTF */ + if (str_off == 0 || str_off < d->btf->start_str_off) return 0; - strs = ctx; - s = bsearch(strs->data + *str_off_ptr, strs->ptrs, strs->cnt, - sizeof(struct btf_str_ptr), btf_dedup_str_ptr_cmp); - if (!s) - return -EINVAL; - s->used = true; - return 0; -} + s = btf__str_by_offset(d->btf, str_off); + if (d->btf->base_btf) { + err = btf__find_str(d->btf->base_btf, s); + if (err >= 0) { + *str_off_ptr = err; + return 0; + } + if (err != -ENOENT) + return err; + } -static int btf_str_remap_offset(__u32 *str_off_ptr, void *ctx) -{ - struct btf_str_ptrs *strs; - struct btf_str_ptr *s; + len = strlen(s) + 1; - if (*str_off_ptr == 0) - return 0; + new_off = d->strs_len; + p = btf_add_mem(&d->strs_data, &d->strs_cap, 1, new_off, BTF_MAX_STR_OFFSET, len); + if (!p) + return -ENOMEM; - strs = ctx; - s = bsearch(strs->data + *str_off_ptr, strs->ptrs, strs->cnt, - sizeof(struct btf_str_ptr), btf_dedup_str_ptr_cmp); - if (!s) - return -EINVAL; - *str_off_ptr = s->new_off; + memcpy(p, s, len); + + /* Now attempt to add the string, but only if the string with the same + * contents doesn't exist already (HASHMAP_ADD strategy). If such + * string exists, we'll get its offset in old_off (that's old_key). + */ + err = hashmap__insert(d->strs_hash, (void *)new_off, (void *)new_off, + HASHMAP_ADD, (const void **)&old_off, NULL); + if (err == -EEXIST) { + *str_off_ptr = d->btf->start_str_off + old_off; + } else if (err) { + return err; + } else { + *str_off_ptr = d->btf->start_str_off + new_off; + d->strs_len += len; + } return 0; } @@ -3174,118 +3263,71 @@ static int btf_str_remap_offset(__u32 *str_off_ptr, void *ctx) */ static int btf_dedup_strings(struct btf_dedup *d) { - char *start = d->btf->strs_data; - char *end = start + d->btf->hdr->str_len; - char *p = start, *tmp_strs = NULL; - struct btf_str_ptrs strs = { - .cnt = 0, - .cap = 0, - .ptrs = NULL, - .data = start, - }; - int i, j, err = 0, grp_idx; - bool grp_used; + char *s; + int err; if (d->btf->strs_deduped) return 0; - /* build index of all strings */ - while (p < end) { - if (strs.cnt + 1 > strs.cap) { - struct btf_str_ptr *new_ptrs; - - strs.cap += max(strs.cnt / 2, 16U); - new_ptrs = libbpf_reallocarray(strs.ptrs, strs.cap, sizeof(strs.ptrs[0])); - if (!new_ptrs) { - err = -ENOMEM; - goto done; - } - strs.ptrs = new_ptrs; - } - - strs.ptrs[strs.cnt].str = p; - strs.ptrs[strs.cnt].used = false; - - p += strlen(p) + 1; - strs.cnt++; - } + /* temporarily switch to use btf_dedup's strs_data for strings for hash + * functions; later we'll just transfer hashmap to struct btf as is, + * along the strs_data + */ + d->btf->strs_data_ptr = &d->strs_data; - /* temporary storage for deduplicated strings */ - tmp_strs = malloc(d->btf->hdr->str_len); - if (!tmp_strs) { - err = -ENOMEM; - goto done; + d->strs_hash = hashmap__new(strs_hash_fn, strs_hash_equal_fn, d->btf); + if (IS_ERR(d->strs_hash)) { + err = PTR_ERR(d->strs_hash); + d->strs_hash = NULL; + goto err_out; } - /* mark all used strings */ - strs.ptrs[0].used = true; - err = btf_for_each_str_off(d, btf_str_mark_as_used, &strs); - if (err) - goto done; - - /* sort strings by context, so that we can identify duplicates */ - qsort(strs.ptrs, strs.cnt, sizeof(strs.ptrs[0]), str_sort_by_content); - - /* - * iterate groups of equal strings and if any instance in a group was - * referenced, emit single instance and remember new offset - */ - p = tmp_strs; - grp_idx = 0; - grp_used = strs.ptrs[0].used; - /* iterate past end to avoid code duplication after loop */ - for (i = 1; i <= strs.cnt; i++) { - /* - * when i == strs.cnt, we want to skip string comparison and go - * straight to handling last group of strings (otherwise we'd - * need to handle last group after the loop w/ duplicated code) - */ - if (i < strs.cnt && - !strcmp(strs.ptrs[i].str, strs.ptrs[grp_idx].str)) { - grp_used = grp_used || strs.ptrs[i].used; - continue; - } + if (!d->btf->base_btf) { + s = btf_add_mem(&d->strs_data, &d->strs_cap, 1, d->strs_len, BTF_MAX_STR_OFFSET, 1); + if (!s) + return -ENOMEM; + /* initial empty string */ + s[0] = 0; + d->strs_len = 1; - /* - * this check would have been required after the loop to handle - * last group of strings, but due to <= condition in a loop - * we avoid that duplication + /* insert empty string; we won't be looking it up during strings + * dedup, but it's good to have it for generic BTF string lookups */ - if (grp_used) { - int new_off = p - tmp_strs; - __u32 len = strlen(strs.ptrs[grp_idx].str); - - memmove(p, strs.ptrs[grp_idx].str, len + 1); - for (j = grp_idx; j < i; j++) - strs.ptrs[j].new_off = new_off; - p += len + 1; - } - - if (i < strs.cnt) { - grp_idx = i; - grp_used = strs.ptrs[i].used; - } + err = hashmap__insert(d->strs_hash, (void *)0, (void *)0, + HASHMAP_ADD, NULL, NULL); + if (err) + goto err_out; } - /* replace original strings with deduped ones */ - d->btf->hdr->str_len = p - tmp_strs; - memmove(start, tmp_strs, d->btf->hdr->str_len); - end = start + d->btf->hdr->str_len; - - /* restore original order for further binary search lookups */ - qsort(strs.ptrs, strs.cnt, sizeof(strs.ptrs[0]), str_sort_by_offset); - /* remap string offsets */ - err = btf_for_each_str_off(d, btf_str_remap_offset, &strs); + err = btf_for_each_str_off(d, strs_dedup_remap_str_off, d); if (err) - goto done; + goto err_out; - d->btf->hdr->str_len = end - start; + /* replace BTF string data and hash with deduped ones */ + free(d->btf->strs_data); + hashmap__free(d->btf->strs_hash); + d->btf->strs_data = d->strs_data; + d->btf->strs_data_cap = d->strs_cap; + d->btf->hdr->str_len = d->strs_len; + d->btf->strs_hash = d->strs_hash; + /* now point strs_data_ptr back to btf->strs_data */ + d->btf->strs_data_ptr = &d->btf->strs_data; + + d->strs_data = d->strs_hash = NULL; + d->strs_len = d->strs_cap = 0; d->btf->strs_deduped = true; + return 0; + +err_out: + free(d->strs_data); + hashmap__free(d->strs_hash); + d->strs_data = d->strs_hash = NULL; + d->strs_len = d->strs_cap = 0; + + /* restore strings pointer for existing d->btf->strs_hash back */ + d->btf->strs_data_ptr = &d->strs_data; -done: - free(tmp_strs); - free(strs.ptrs); return err; } @@ -3550,6 +3592,66 @@ static bool btf_compat_fnproto(struct btf_type *t1, struct btf_type *t2) return true; } +/* Prepare split BTF for deduplication by calculating hashes of base BTF's + * types and initializing the rest of the state (canonical type mapping) for + * the fixed base BTF part. + */ +static int btf_dedup_prep(struct btf_dedup *d) +{ + struct btf_type *t; + int type_id; + long h; + + if (!d->btf->base_btf) + return 0; + + for (type_id = 1; type_id < d->btf->start_id; type_id++) { + t = btf_type_by_id(d->btf, type_id); + + /* all base BTF types are self-canonical by definition */ + d->map[type_id] = type_id; + + switch (btf_kind(t)) { + case BTF_KIND_VAR: + case BTF_KIND_DATASEC: + /* VAR and DATASEC are never hash/deduplicated */ + continue; + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_FWD: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + h = btf_hash_common(t); + break; + case BTF_KIND_INT: + h = btf_hash_int(t); + break; + case BTF_KIND_ENUM: + h = btf_hash_enum(t); + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + h = btf_hash_struct(t); + break; + case BTF_KIND_ARRAY: + h = btf_hash_array(t); + break; + case BTF_KIND_FUNC_PROTO: + h = btf_hash_fnproto(t); + break; + default: + pr_debug("unknown kind %d for type [%d]\n", btf_kind(t), type_id); + return -EINVAL; + } + if (btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + } + + return 0; +} + /* * Deduplicate primitive types, that can't reference other types, by calculating * their type signature hash and comparing them with any possible canonical @@ -3643,8 +3745,8 @@ static int btf_dedup_prim_types(struct btf_dedup *d) { int i, err; - for (i = 1; i <= d->btf->nr_types; i++) { - err = btf_dedup_prim_type(d, i); + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_prim_type(d, d->btf->start_id + i); if (err) return err; } @@ -3697,6 +3799,19 @@ static inline __u16 btf_fwd_kind(struct btf_type *t) return btf_kflag(t) ? BTF_KIND_UNION : BTF_KIND_STRUCT; } +/* Check if given two types are identical ARRAY definitions */ +static int btf_dedup_identical_arrays(struct btf_dedup *d, __u32 id1, __u32 id2) +{ + struct btf_type *t1, *t2; + + t1 = btf_type_by_id(d->btf, id1); + t2 = btf_type_by_id(d->btf, id2); + if (!btf_is_array(t1) || !btf_is_array(t2)) + return 0; + + return btf_equal_array(t1, t2); +} + /* * Check equivalence of BTF type graph formed by candidate struct/union (we'll * call it "candidate graph" in this description for brevity) to a type graph @@ -3807,8 +3922,18 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, canon_id = resolve_fwd_id(d, canon_id); hypot_type_id = d->hypot_map[canon_id]; - if (hypot_type_id <= BTF_MAX_NR_TYPES) - return hypot_type_id == cand_id; + if (hypot_type_id <= BTF_MAX_NR_TYPES) { + /* In some cases compiler will generate different DWARF types + * for *identical* array type definitions and use them for + * different fields within the *same* struct. This breaks type + * equivalence check, which makes an assumption that candidate + * types sub-graph has a consistent and deduped-by-compiler + * types within a single CU. So work around that by explicitly + * allowing identical array types here. + */ + return hypot_type_id == cand_id || + btf_dedup_identical_arrays(d, hypot_type_id, cand_id); + } if (btf_dedup_hypot_map_add(d, canon_id, cand_id)) return -ENOMEM; @@ -3834,6 +3959,9 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, } else { real_kind = cand_kind; fwd_kind = btf_fwd_kind(canon_type); + /* we'd need to resolve base FWD to STRUCT/UNION */ + if (fwd_kind == real_kind && canon_id < d->btf->start_id) + d->hypot_adjust_canon = true; } return fwd_kind == real_kind; } @@ -3871,8 +3999,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, return 0; cand_arr = btf_array(cand_type); canon_arr = btf_array(canon_type); - eq = btf_dedup_is_equiv(d, - cand_arr->index_type, canon_arr->index_type); + eq = btf_dedup_is_equiv(d, cand_arr->index_type, canon_arr->index_type); if (eq <= 0) return eq; return btf_dedup_is_equiv(d, cand_arr->type, canon_arr->type); @@ -3955,16 +4082,16 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, */ static void btf_dedup_merge_hypot_map(struct btf_dedup *d) { - __u32 cand_type_id, targ_type_id; + __u32 canon_type_id, targ_type_id; __u16 t_kind, c_kind; __u32 t_id, c_id; int i; for (i = 0; i < d->hypot_cnt; i++) { - cand_type_id = d->hypot_list[i]; - targ_type_id = d->hypot_map[cand_type_id]; + canon_type_id = d->hypot_list[i]; + targ_type_id = d->hypot_map[canon_type_id]; t_id = resolve_type_id(d, targ_type_id); - c_id = resolve_type_id(d, cand_type_id); + c_id = resolve_type_id(d, canon_type_id); t_kind = btf_kind(btf__type_by_id(d->btf, t_id)); c_kind = btf_kind(btf__type_by_id(d->btf, c_id)); /* @@ -3979,9 +4106,26 @@ static void btf_dedup_merge_hypot_map(struct btf_dedup *d) * stability is not a requirement for STRUCT/UNION equivalence * checks, though. */ + + /* if it's the split BTF case, we still need to point base FWD + * to STRUCT/UNION in a split BTF, because FWDs from split BTF + * will be resolved against base FWD. If we don't point base + * canonical FWD to the resolved STRUCT/UNION, then all the + * FWDs in split BTF won't be correctly resolved to a proper + * STRUCT/UNION. + */ if (t_kind != BTF_KIND_FWD && c_kind == BTF_KIND_FWD) d->map[c_id] = t_id; - else if (t_kind == BTF_KIND_FWD && c_kind != BTF_KIND_FWD) + + /* if graph equivalence determined that we'd need to adjust + * base canonical types, then we need to only point base FWDs + * to STRUCTs/UNIONs and do no more modifications. For all + * other purposes the type graphs were not equivalent. + */ + if (d->hypot_adjust_canon) + continue; + + if (t_kind == BTF_KIND_FWD && c_kind != BTF_KIND_FWD) d->map[t_id] = c_id; if ((t_kind == BTF_KIND_STRUCT || t_kind == BTF_KIND_UNION) && @@ -4065,8 +4209,10 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) return eq; if (!eq) continue; - new_id = cand_id; btf_dedup_merge_hypot_map(d); + if (d->hypot_adjust_canon) /* not really equivalent */ + continue; + new_id = cand_id; break; } @@ -4081,8 +4227,8 @@ static int btf_dedup_struct_types(struct btf_dedup *d) { int i, err; - for (i = 1; i <= d->btf->nr_types; i++) { - err = btf_dedup_struct_type(d, i); + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_struct_type(d, d->btf->start_id + i); if (err) return err; } @@ -4225,8 +4371,8 @@ static int btf_dedup_ref_types(struct btf_dedup *d) { int i, err; - for (i = 1; i <= d->btf->nr_types; i++) { - err = btf_dedup_ref_type(d, i); + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_ref_type(d, d->btf->start_id + i); if (err < 0) return err; } @@ -4250,39 +4396,44 @@ static int btf_dedup_ref_types(struct btf_dedup *d) static int btf_dedup_compact_types(struct btf_dedup *d) { __u32 *new_offs; - __u32 next_type_id = 1; + __u32 next_type_id = d->btf->start_id; + const struct btf_type *t; void *p; - int i, len; + int i, id, len; /* we are going to reuse hypot_map to store compaction remapping */ d->hypot_map[0] = 0; - for (i = 1; i <= d->btf->nr_types; i++) - d->hypot_map[i] = BTF_UNPROCESSED_ID; + /* base BTF types are not renumbered */ + for (id = 1; id < d->btf->start_id; id++) + d->hypot_map[id] = id; + for (i = 0, id = d->btf->start_id; i < d->btf->nr_types; i++, id++) + d->hypot_map[id] = BTF_UNPROCESSED_ID; p = d->btf->types_data; - for (i = 1; i <= d->btf->nr_types; i++) { - if (d->map[i] != i) + for (i = 0, id = d->btf->start_id; i < d->btf->nr_types; i++, id++) { + if (d->map[id] != id) continue; - len = btf_type_size(btf__type_by_id(d->btf, i)); + t = btf__type_by_id(d->btf, id); + len = btf_type_size(t); if (len < 0) return len; - memmove(p, btf__type_by_id(d->btf, i), len); - d->hypot_map[i] = next_type_id; - d->btf->type_offs[next_type_id] = p - d->btf->types_data; + memmove(p, t, len); + d->hypot_map[id] = next_type_id; + d->btf->type_offs[next_type_id - d->btf->start_id] = p - d->btf->types_data; p += len; next_type_id++; } /* shrink struct btf's internal types index and update btf_header */ - d->btf->nr_types = next_type_id - 1; - d->btf->type_offs_cap = d->btf->nr_types + 1; + d->btf->nr_types = next_type_id - d->btf->start_id; + d->btf->type_offs_cap = d->btf->nr_types; d->btf->hdr->type_len = p - d->btf->types_data; new_offs = libbpf_reallocarray(d->btf->type_offs, d->btf->type_offs_cap, sizeof(*new_offs)); - if (!new_offs) + if (d->btf->type_offs_cap && !new_offs) return -ENOMEM; d->btf->type_offs = new_offs; d->btf->hdr->str_off = d->btf->hdr->type_len; @@ -4414,8 +4565,8 @@ static int btf_dedup_remap_types(struct btf_dedup *d) { int i, r; - for (i = 1; i <= d->btf->nr_types; i++) { - r = btf_dedup_remap_type(d, i); + for (i = 0; i < d->btf->nr_types; i++) { + r = btf_dedup_remap_type(d, d->btf->start_id + i); if (r < 0) return r; } diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 57247240a20a..1237bcd1dd17 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -31,11 +31,19 @@ enum btf_endianness { }; LIBBPF_API void btf__free(struct btf *btf); + LIBBPF_API struct btf *btf__new(const void *data, __u32 size); +LIBBPF_API struct btf *btf__new_split(const void *data, __u32 size, struct btf *base_btf); LIBBPF_API struct btf *btf__new_empty(void); +LIBBPF_API struct btf *btf__new_empty_split(struct btf *base_btf); + LIBBPF_API struct btf *btf__parse(const char *path, struct btf_ext **btf_ext); +LIBBPF_API struct btf *btf__parse_split(const char *path, struct btf *base_btf); LIBBPF_API struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext); +LIBBPF_API struct btf *btf__parse_elf_split(const char *path, struct btf *base_btf); LIBBPF_API struct btf *btf__parse_raw(const char *path); +LIBBPF_API struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf); + LIBBPF_API int btf__finalize_data(struct bpf_object *obj, struct btf *btf); LIBBPF_API int btf__load(struct btf *btf); LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, @@ -43,6 +51,7 @@ LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, LIBBPF_API __s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name, __u32 kind); LIBBPF_API __u32 btf__get_nr_types(const struct btf *btf); +LIBBPF_API const struct btf *btf__base_btf(const struct btf *btf); LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id); LIBBPF_API size_t btf__pointer_size(const struct btf *btf); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 28baee7ba1ca..6ae748f6ea11 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -176,6 +176,8 @@ enum kern_feature_id { FEAT_PROBE_READ_KERN, /* BPF_PROG_BIND_MAP is supported */ FEAT_PROG_BIND_MAP, + /* Kernel support for module BTFs */ + FEAT_MODULE_BTF, __FEAT_CNT, }; @@ -276,6 +278,7 @@ struct bpf_program { enum bpf_prog_type type; enum bpf_attach_type expected_attach_type; int prog_ifindex; + __u32 attach_btf_obj_fd; __u32 attach_btf_id; __u32 attach_prog_fd; void *func_info; @@ -402,6 +405,13 @@ struct extern_desc { static LIST_HEAD(bpf_objects_list); +struct module_btf { + struct btf *btf; + char *name; + __u32 id; + int fd; +}; + struct bpf_object { char name[BPF_OBJ_NAME_LEN]; char license[64]; @@ -462,11 +472,19 @@ struct bpf_object { struct list_head list; struct btf *btf; + struct btf_ext *btf_ext; + /* Parse and load BTF vmlinux if any of the programs in the object need * it at load time. */ struct btf *btf_vmlinux; - struct btf_ext *btf_ext; + /* vmlinux BTF override for CO-RE relocations */ + struct btf *btf_vmlinux_override; + /* Lazily initialized kernel module BTFs */ + struct module_btf *btf_modules; + bool btf_modules_loaded; + size_t btf_module_cnt; + size_t btf_module_cap; void *priv; bpf_object_clear_priv_t clear_priv; @@ -2500,7 +2518,7 @@ static int bpf_object__finalize_btf(struct bpf_object *obj) return 0; } -static inline bool libbpf_prog_needs_vmlinux_btf(struct bpf_program *prog) +static bool prog_needs_vmlinux_btf(struct bpf_program *prog) { if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || prog->type == BPF_PROG_TYPE_LSM) @@ -2515,37 +2533,43 @@ static inline bool libbpf_prog_needs_vmlinux_btf(struct bpf_program *prog) return false; } -static int bpf_object__load_vmlinux_btf(struct bpf_object *obj) +static bool obj_needs_vmlinux_btf(const struct bpf_object *obj) { - bool need_vmlinux_btf = false; struct bpf_program *prog; - int i, err; + int i; /* CO-RE relocations need kernel BTF */ if (obj->btf_ext && obj->btf_ext->core_relo_info.len) - need_vmlinux_btf = true; + return true; /* Support for typed ksyms needs kernel BTF */ for (i = 0; i < obj->nr_extern; i++) { const struct extern_desc *ext; ext = &obj->externs[i]; - if (ext->type == EXT_KSYM && ext->ksym.type_id) { - need_vmlinux_btf = true; - break; - } + if (ext->type == EXT_KSYM && ext->ksym.type_id) + return true; } bpf_object__for_each_program(prog, obj) { if (!prog->load) continue; - if (libbpf_prog_needs_vmlinux_btf(prog)) { - need_vmlinux_btf = true; - break; - } + if (prog_needs_vmlinux_btf(prog)) + return true; } - if (!need_vmlinux_btf) + return false; +} + +static int bpf_object__load_vmlinux_btf(struct bpf_object *obj, bool force) +{ + int err; + + /* btf_vmlinux could be loaded earlier */ + if (obj->btf_vmlinux) + return 0; + + if (!force && !obj_needs_vmlinux_btf(obj)) return 0; obj->btf_vmlinux = libbpf_find_kernel_btf(); @@ -3960,6 +3984,35 @@ static int probe_prog_bind_map(void) return ret >= 0; } +static int probe_module_btf(void) +{ + static const char strs[] = "\0int"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), + }; + struct bpf_btf_info info; + __u32 len = sizeof(info); + char name[16]; + int fd, err; + + fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs)); + if (fd < 0) + return 0; /* BTF not supported at all */ + + memset(&info, 0, sizeof(info)); + info.name = ptr_to_u64(name); + info.name_len = sizeof(name); + + /* check that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer; + * kernel's module BTF support coincides with support for + * name/name_len fields in struct bpf_btf_info. + */ + err = bpf_obj_get_info_by_fd(fd, &info, &len); + close(fd); + return !err; +} + enum kern_feature_result { FEAT_UNKNOWN = 0, FEAT_SUPPORTED = 1, @@ -4003,7 +4056,10 @@ static struct kern_feature_desc { }, [FEAT_PROG_BIND_MAP] = { "BPF_PROG_BIND_MAP support", probe_prog_bind_map, - } + }, + [FEAT_MODULE_BTF] = { + "module BTF support", probe_module_btf, + }, }; static bool kernel_supports(enum kern_feature_id feat_id) @@ -4603,46 +4659,43 @@ static size_t bpf_core_essential_name_len(const char *name) return n; } -/* dynamically sized list of type IDs */ -struct ids_vec { - __u32 *data; +struct core_cand +{ + const struct btf *btf; + const struct btf_type *t; + const char *name; + __u32 id; +}; + +/* dynamically sized list of type IDs and its associated struct btf */ +struct core_cand_list { + struct core_cand *cands; int len; }; -static void bpf_core_free_cands(struct ids_vec *cand_ids) +static void bpf_core_free_cands(struct core_cand_list *cands) { - free(cand_ids->data); - free(cand_ids); + free(cands->cands); + free(cands); } -static struct ids_vec *bpf_core_find_cands(const struct btf *local_btf, - __u32 local_type_id, - const struct btf *targ_btf) +static int bpf_core_add_cands(struct core_cand *local_cand, + size_t local_essent_len, + const struct btf *targ_btf, + const char *targ_btf_name, + int targ_start_id, + struct core_cand_list *cands) { - size_t local_essent_len, targ_essent_len; - const char *local_name, *targ_name; - const struct btf_type *t, *local_t; - struct ids_vec *cand_ids; - __u32 *new_ids; - int i, err, n; - - local_t = btf__type_by_id(local_btf, local_type_id); - if (!local_t) - return ERR_PTR(-EINVAL); - - local_name = btf__name_by_offset(local_btf, local_t->name_off); - if (str_is_empty(local_name)) - return ERR_PTR(-EINVAL); - local_essent_len = bpf_core_essential_name_len(local_name); - - cand_ids = calloc(1, sizeof(*cand_ids)); - if (!cand_ids) - return ERR_PTR(-ENOMEM); + struct core_cand *new_cands, *cand; + const struct btf_type *t; + const char *targ_name; + size_t targ_essent_len; + int n, i; n = btf__get_nr_types(targ_btf); - for (i = 1; i <= n; i++) { + for (i = targ_start_id; i <= n; i++) { t = btf__type_by_id(targ_btf, i); - if (btf_kind(t) != btf_kind(local_t)) + if (btf_kind(t) != btf_kind(local_cand->t)) continue; targ_name = btf__name_by_offset(targ_btf, t->name_off); @@ -4653,24 +4706,174 @@ static struct ids_vec *bpf_core_find_cands(const struct btf *local_btf, if (targ_essent_len != local_essent_len) continue; - if (strncmp(local_name, targ_name, local_essent_len) == 0) { - pr_debug("CO-RE relocating [%d] %s %s: found target candidate [%d] %s %s\n", - local_type_id, btf_kind_str(local_t), - local_name, i, btf_kind_str(t), targ_name); - new_ids = libbpf_reallocarray(cand_ids->data, - cand_ids->len + 1, - sizeof(*cand_ids->data)); - if (!new_ids) { - err = -ENOMEM; - goto err_out; - } - cand_ids->data = new_ids; - cand_ids->data[cand_ids->len++] = i; + if (strncmp(local_cand->name, targ_name, local_essent_len) != 0) + continue; + + pr_debug("CO-RE relocating [%d] %s %s: found target candidate [%d] %s %s in [%s]\n", + local_cand->id, btf_kind_str(local_cand->t), + local_cand->name, i, btf_kind_str(t), targ_name, + targ_btf_name); + new_cands = libbpf_reallocarray(cands->cands, cands->len + 1, + sizeof(*cands->cands)); + if (!new_cands) + return -ENOMEM; + + cand = &new_cands[cands->len]; + cand->btf = targ_btf; + cand->t = t; + cand->name = targ_name; + cand->id = i; + + cands->cands = new_cands; + cands->len++; + } + return 0; +} + +static int load_module_btfs(struct bpf_object *obj) +{ + struct bpf_btf_info info; + struct module_btf *mod_btf; + struct btf *btf; + char name[64]; + __u32 id = 0, len; + int err, fd; + + if (obj->btf_modules_loaded) + return 0; + + /* don't do this again, even if we find no module BTFs */ + obj->btf_modules_loaded = true; + + /* kernel too old to support module BTFs */ + if (!kernel_supports(FEAT_MODULE_BTF)) + return 0; + + while (true) { + err = bpf_btf_get_next_id(id, &id); + if (err && errno == ENOENT) + return 0; + if (err) { + err = -errno; + pr_warn("failed to iterate BTF objects: %d\n", err); + return err; + } + + fd = bpf_btf_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; /* expected race: BTF was unloaded */ + err = -errno; + pr_warn("failed to get BTF object #%d FD: %d\n", id, err); + return err; } + + len = sizeof(info); + memset(&info, 0, sizeof(info)); + info.name = ptr_to_u64(name); + info.name_len = sizeof(name); + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + err = -errno; + pr_warn("failed to get BTF object #%d info: %d\n", id, err); + goto err_out; + } + + /* ignore non-module BTFs */ + if (!info.kernel_btf || strcmp(name, "vmlinux") == 0) { + close(fd); + continue; + } + + btf = btf_get_from_fd(fd, obj->btf_vmlinux); + if (IS_ERR(btf)) { + pr_warn("failed to load module [%s]'s BTF object #%d: %ld\n", + name, id, PTR_ERR(btf)); + err = PTR_ERR(btf); + goto err_out; + } + + err = btf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, + sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); + if (err) + goto err_out; + + mod_btf = &obj->btf_modules[obj->btf_module_cnt++]; + + mod_btf->btf = btf; + mod_btf->id = id; + mod_btf->fd = fd; + mod_btf->name = strdup(name); + if (!mod_btf->name) { + err = -ENOMEM; + goto err_out; + } + continue; + +err_out: + close(fd); + return err; + } + + return 0; +} + +static struct core_cand_list * +bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 local_type_id) +{ + struct core_cand local_cand = {}; + struct core_cand_list *cands; + const struct btf *main_btf; + size_t local_essent_len; + int err, i; + + local_cand.btf = local_btf; + local_cand.t = btf__type_by_id(local_btf, local_type_id); + if (!local_cand.t) + return ERR_PTR(-EINVAL); + + local_cand.name = btf__name_by_offset(local_btf, local_cand.t->name_off); + if (str_is_empty(local_cand.name)) + return ERR_PTR(-EINVAL); + local_essent_len = bpf_core_essential_name_len(local_cand.name); + + cands = calloc(1, sizeof(*cands)); + if (!cands) + return ERR_PTR(-ENOMEM); + + /* Attempt to find target candidates in vmlinux BTF first */ + main_btf = obj->btf_vmlinux_override ?: obj->btf_vmlinux; + err = bpf_core_add_cands(&local_cand, local_essent_len, main_btf, "vmlinux", 1, cands); + if (err) + goto err_out; + + /* if vmlinux BTF has any candidate, don't got for module BTFs */ + if (cands->len) + return cands; + + /* if vmlinux BTF was overridden, don't attempt to load module BTFs */ + if (obj->btf_vmlinux_override) + return cands; + + /* now look through module BTFs, trying to still find candidates */ + err = load_module_btfs(obj); + if (err) + goto err_out; + + for (i = 0; i < obj->btf_module_cnt; i++) { + err = bpf_core_add_cands(&local_cand, local_essent_len, + obj->btf_modules[i].btf, + obj->btf_modules[i].name, + btf__get_nr_types(obj->btf_vmlinux) + 1, + cands); + if (err) + goto err_out; } - return cand_ids; + + return cands; err_out: - bpf_core_free_cands(cand_ids); + bpf_core_free_cands(cands); return ERR_PTR(err); } @@ -5664,7 +5867,6 @@ static int bpf_core_apply_relo(struct bpf_program *prog, const struct bpf_core_relo *relo, int relo_idx, const struct btf *local_btf, - const struct btf *targ_btf, struct hashmap *cand_cache) { struct bpf_core_spec local_spec, cand_spec, targ_spec = {}; @@ -5672,8 +5874,8 @@ static int bpf_core_apply_relo(struct bpf_program *prog, struct bpf_core_relo_res cand_res, targ_res; const struct btf_type *local_type; const char *local_name; - struct ids_vec *cand_ids; - __u32 local_id, cand_id; + struct core_cand_list *cands = NULL; + __u32 local_id; const char *spec_str; int i, j, err; @@ -5720,24 +5922,24 @@ static int bpf_core_apply_relo(struct bpf_program *prog, return -EOPNOTSUPP; } - if (!hashmap__find(cand_cache, type_key, (void **)&cand_ids)) { - cand_ids = bpf_core_find_cands(local_btf, local_id, targ_btf); - if (IS_ERR(cand_ids)) { - pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld", + if (!hashmap__find(cand_cache, type_key, (void **)&cands)) { + cands = bpf_core_find_cands(prog->obj, local_btf, local_id); + if (IS_ERR(cands)) { + pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld\n", prog->name, relo_idx, local_id, btf_kind_str(local_type), - local_name, PTR_ERR(cand_ids)); - return PTR_ERR(cand_ids); + local_name, PTR_ERR(cands)); + return PTR_ERR(cands); } - err = hashmap__set(cand_cache, type_key, cand_ids, NULL, NULL); + err = hashmap__set(cand_cache, type_key, cands, NULL, NULL); if (err) { - bpf_core_free_cands(cand_ids); + bpf_core_free_cands(cands); return err; } } - for (i = 0, j = 0; i < cand_ids->len; i++) { - cand_id = cand_ids->data[i]; - err = bpf_core_spec_match(&local_spec, targ_btf, cand_id, &cand_spec); + for (i = 0, j = 0; i < cands->len; i++) { + err = bpf_core_spec_match(&local_spec, cands->cands[i].btf, + cands->cands[i].id, &cand_spec); if (err < 0) { pr_warn("prog '%s': relo #%d: error matching candidate #%d ", prog->name, relo_idx, i); @@ -5781,7 +5983,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, return -EINVAL; } - cand_ids->data[j++] = cand_spec.root_type_id; + cands->cands[j++] = cands->cands[i]; } /* @@ -5793,7 +5995,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, * depending on relo's kind. */ if (j > 0) - cand_ids->len = j; + cands->len = j; /* * If no candidates were found, it might be both a programmer error, @@ -5837,20 +6039,19 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) struct hashmap_entry *entry; struct hashmap *cand_cache = NULL; struct bpf_program *prog; - struct btf *targ_btf; const char *sec_name; int i, err = 0, insn_idx, sec_idx; if (obj->btf_ext->core_relo_info.len == 0) return 0; - if (targ_btf_path) - targ_btf = btf__parse(targ_btf_path, NULL); - else - targ_btf = obj->btf_vmlinux; - if (IS_ERR_OR_NULL(targ_btf)) { - pr_warn("failed to get target BTF: %ld\n", PTR_ERR(targ_btf)); - return PTR_ERR(targ_btf); + if (targ_btf_path) { + obj->btf_vmlinux_override = btf__parse(targ_btf_path, NULL); + if (IS_ERR_OR_NULL(obj->btf_vmlinux_override)) { + err = PTR_ERR(obj->btf_vmlinux_override); + pr_warn("failed to parse target BTF: %d\n", err); + return err; + } } cand_cache = hashmap__new(bpf_core_hash_fn, bpf_core_equal_fn, NULL); @@ -5902,8 +6103,7 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) if (!prog->load) continue; - err = bpf_core_apply_relo(prog, rec, i, obj->btf, - targ_btf, cand_cache); + err = bpf_core_apply_relo(prog, rec, i, obj->btf, cand_cache); if (err) { pr_warn("prog '%s': relo #%d: failed to relocate: %d\n", prog->name, i, err); @@ -5913,9 +6113,10 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) } out: - /* obj->btf_vmlinux is freed at the end of object load phase */ - if (targ_btf != obj->btf_vmlinux) - btf__free(targ_btf); + /* obj->btf_vmlinux and module BTFs are freed after object load */ + btf__free(obj->btf_vmlinux_override); + obj->btf_vmlinux_override = NULL; + if (!IS_ERR_OR_NULL(cand_cache)) { hashmap__for_each_entry(cand_cache, entry, i) { bpf_core_free_cands(entry->value); @@ -6626,16 +6827,25 @@ static int load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, char *license, __u32 kern_version, int *pfd) { - struct bpf_load_program_attr load_attr; + struct bpf_prog_load_params load_attr = {}; char *cp, errmsg[STRERR_BUFSIZE]; size_t log_buf_size = 0; char *log_buf = NULL; int btf_fd, ret; + if (prog->type == BPF_PROG_TYPE_UNSPEC) { + /* + * The program type must be set. Most likely we couldn't find a proper + * section definition at load time, and thus we didn't infer the type. + */ + pr_warn("prog '%s': missing BPF prog type, check ELF section name '%s'\n", + prog->name, prog->sec_name); + return -EINVAL; + } + if (!insns || !insns_cnt) return -EINVAL; - memset(&load_attr, 0, sizeof(struct bpf_load_program_attr)); load_attr.prog_type = prog->type; /* old kernels might not support specifying expected_attach_type */ if (!kernel_supports(FEAT_EXP_ATTACH_TYPE) && prog->sec_def && @@ -6646,19 +6856,17 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, if (kernel_supports(FEAT_PROG_NAME)) load_attr.name = prog->name; load_attr.insns = insns; - load_attr.insns_cnt = insns_cnt; + load_attr.insn_cnt = insns_cnt; load_attr.license = license; - if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || - prog->type == BPF_PROG_TYPE_LSM) { - load_attr.attach_btf_id = prog->attach_btf_id; - } else if (prog->type == BPF_PROG_TYPE_TRACING || - prog->type == BPF_PROG_TYPE_EXT) { + load_attr.attach_btf_id = prog->attach_btf_id; + if (prog->attach_prog_fd) load_attr.attach_prog_fd = prog->attach_prog_fd; - load_attr.attach_btf_id = prog->attach_btf_id; - } else { - load_attr.kern_version = kern_version; - load_attr.prog_ifindex = prog->prog_ifindex; - } + else + load_attr.attach_btf_obj_fd = prog->attach_btf_obj_fd; + load_attr.attach_btf_id = prog->attach_btf_id; + load_attr.kern_version = kern_version; + load_attr.prog_ifindex = prog->prog_ifindex; + /* specify func_info/line_info only if kernel supports them */ btf_fd = bpf_object__btf_fd(prog->obj); if (btf_fd >= 0 && kernel_supports(FEAT_BTF_FUNC)) { @@ -6682,7 +6890,9 @@ retry_load: *log_buf = 0; } - ret = bpf_load_program_xattr(&load_attr, log_buf, log_buf_size); + load_attr.log_buf = log_buf; + load_attr.log_buf_sz = log_buf_size; + ret = libbpf__bpf_prog_load(&load_attr); if (ret >= 0) { if (log_buf && load_attr.log_level) @@ -6723,9 +6933,9 @@ retry_load: pr_warn("-- BEGIN DUMP LOG ---\n"); pr_warn("\n%s\n", log_buf); pr_warn("-- END LOG --\n"); - } else if (load_attr.insns_cnt >= BPF_MAXINSNS) { + } else if (load_attr.insn_cnt >= BPF_MAXINSNS) { pr_warn("Program too large (%zu insns), at most %d insns\n", - load_attr.insns_cnt, BPF_MAXINSNS); + load_attr.insn_cnt, BPF_MAXINSNS); ret = -LIBBPF_ERRNO__PROG2BIG; } else if (load_attr.prog_type != BPF_PROG_TYPE_KPROBE) { /* Wrong program type? */ @@ -6733,7 +6943,9 @@ retry_load: load_attr.prog_type = BPF_PROG_TYPE_KPROBE; load_attr.expected_attach_type = 0; - fd = bpf_load_program_xattr(&load_attr, NULL, 0); + load_attr.log_buf = NULL; + load_attr.log_buf_sz = 0; + fd = libbpf__bpf_prog_load(&load_attr); if (fd >= 0) { close(fd); ret = -LIBBPF_ERRNO__PROGTYPE; @@ -6746,11 +6958,11 @@ out: return ret; } -static int libbpf_find_attach_btf_id(struct bpf_program *prog); +static int libbpf_find_attach_btf_id(struct bpf_program *prog, int *btf_obj_fd, int *btf_type_id); int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) { - int err = 0, fd, i, btf_id; + int err = 0, fd, i; if (prog->obj->loaded) { pr_warn("prog '%s': can't load after object was loaded\n", prog->name); @@ -6760,10 +6972,14 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) if ((prog->type == BPF_PROG_TYPE_TRACING || prog->type == BPF_PROG_TYPE_LSM || prog->type == BPF_PROG_TYPE_EXT) && !prog->attach_btf_id) { - btf_id = libbpf_find_attach_btf_id(prog); - if (btf_id <= 0) - return btf_id; - prog->attach_btf_id = btf_id; + int btf_obj_fd = 0, btf_type_id = 0; + + err = libbpf_find_attach_btf_id(prog, &btf_obj_fd, &btf_type_id); + if (err) + return err; + + prog->attach_btf_obj_fd = btf_obj_fd; + prog->attach_btf_id = btf_type_id; } if (prog->instances.nr < 0 || !prog->instances.fds) { @@ -6923,9 +7139,12 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, bpf_object__for_each_program(prog, obj) { prog->sec_def = find_sec_def(prog->sec_name); - if (!prog->sec_def) + if (!prog->sec_def) { /* couldn't guess, but user might manually specify */ + pr_debug("prog '%s': unrecognized ELF section name '%s'\n", + prog->name, prog->sec_name); continue; + } if (prog->sec_def->is_sleepable) prog->prog_flags |= BPF_F_SLEEPABLE; @@ -7262,7 +7481,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) } err = bpf_object__probe_loading(obj); - err = err ? : bpf_object__load_vmlinux_btf(obj); + err = err ? : bpf_object__load_vmlinux_btf(obj, false); err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); err = err ? : bpf_object__sanitize_and_load_btf(obj); err = err ? : bpf_object__sanitize_maps(obj); @@ -7271,6 +7490,15 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) err = err ? : bpf_object__relocate(obj, attr->target_btf_path); err = err ? : bpf_object__load_progs(obj, attr->log_level); + /* clean up module BTFs */ + for (i = 0; i < obj->btf_module_cnt; i++) { + close(obj->btf_modules[i].fd); + btf__free(obj->btf_modules[i].btf); + free(obj->btf_modules[i].name); + } + free(obj->btf_modules); + + /* clean up vmlinux BTF */ btf__free(obj->btf_vmlinux); obj->btf_vmlinux = NULL; @@ -7649,6 +7877,16 @@ bool bpf_map__is_pinned(const struct bpf_map *map) return map->pinned; } +static void sanitize_pin_path(char *s) +{ + /* bpffs disallows periods in path names */ + while (*s) { + if (*s == '.') + *s = '_'; + s++; + } +} + int bpf_object__pin_maps(struct bpf_object *obj, const char *path) { struct bpf_map *map; @@ -7678,6 +7916,7 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path) err = -ENAMETOOLONG; goto err_unpin_maps; } + sanitize_pin_path(buf); pin_path = buf; } else if (!map->pin_path) { continue; @@ -7722,6 +7961,7 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) return -EINVAL; else if (len >= PATH_MAX) return -ENAMETOOLONG; + sanitize_pin_path(buf); pin_path = buf; } else if (!map->pin_path) { continue; @@ -8607,8 +8847,8 @@ static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, return btf__find_by_name_kind(btf, btf_type_name, kind); } -static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name, - enum bpf_attach_type attach_type) +static inline int find_attach_btf_id(struct btf *btf, const char *name, + enum bpf_attach_type attach_type) { int err; @@ -8624,9 +8864,6 @@ static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name, else err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); - if (err <= 0) - pr_warn("%s is not found in vmlinux BTF\n", name); - return err; } @@ -8642,7 +8879,10 @@ int libbpf_find_vmlinux_btf_id(const char *name, return -EINVAL; } - err = __find_vmlinux_btf_id(btf, name, attach_type); + err = find_attach_btf_id(btf, name, attach_type); + if (err <= 0) + pr_warn("%s is not found in vmlinux BTF\n", name); + btf__free(btf); return err; } @@ -8680,11 +8920,49 @@ out: return err; } -static int libbpf_find_attach_btf_id(struct bpf_program *prog) +static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, + enum bpf_attach_type attach_type, + int *btf_obj_fd, int *btf_type_id) +{ + int ret, i; + + ret = find_attach_btf_id(obj->btf_vmlinux, attach_name, attach_type); + if (ret > 0) { + *btf_obj_fd = 0; /* vmlinux BTF */ + *btf_type_id = ret; + return 0; + } + if (ret != -ENOENT) + return ret; + + ret = load_module_btfs(obj); + if (ret) + return ret; + + for (i = 0; i < obj->btf_module_cnt; i++) { + const struct module_btf *mod = &obj->btf_modules[i]; + + ret = find_attach_btf_id(mod->btf, attach_name, attach_type); + if (ret > 0) { + *btf_obj_fd = mod->fd; + *btf_type_id = ret; + return 0; + } + if (ret == -ENOENT) + continue; + + return ret; + } + + return -ESRCH; +} + +static int libbpf_find_attach_btf_id(struct bpf_program *prog, int *btf_obj_fd, int *btf_type_id) { enum bpf_attach_type attach_type = prog->expected_attach_type; __u32 attach_prog_fd = prog->attach_prog_fd; - const char *name = prog->sec_name; + const char *name = prog->sec_name, *attach_name; + const struct bpf_sec_def *sec = NULL; int i, err; if (!name) @@ -8695,17 +8973,37 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog) continue; if (strncmp(name, section_defs[i].sec, section_defs[i].len)) continue; - if (attach_prog_fd) - err = libbpf_find_prog_btf_id(name + section_defs[i].len, - attach_prog_fd); - else - err = __find_vmlinux_btf_id(prog->obj->btf_vmlinux, - name + section_defs[i].len, - attach_type); + + sec = §ion_defs[i]; + break; + } + + if (!sec) { + pr_warn("failed to identify BTF ID based on ELF section name '%s'\n", name); + return -ESRCH; + } + attach_name = name + sec->len; + + /* BPF program's BTF ID */ + if (attach_prog_fd) { + err = libbpf_find_prog_btf_id(attach_name, attach_prog_fd); + if (err < 0) { + pr_warn("failed to find BPF program (FD %d) BTF ID for '%s': %d\n", + attach_prog_fd, attach_name, err); + return err; + } + *btf_obj_fd = 0; + *btf_type_id = err; + return 0; + } + + /* kernel/module BTF ID */ + err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id); + if (err) { + pr_warn("failed to find kernel BTF type ID of '%s': %d\n", attach_name, err); return err; } - pr_warn("failed to identify btf_id based on ELF section name '%s'\n", name); - return -ESRCH; + return 0; } int libbpf_attach_type_by_name(const char *name, @@ -10578,22 +10876,33 @@ int bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, const char *attach_func_name) { - int btf_id; + int btf_obj_fd = 0, btf_id = 0, err; if (!prog || attach_prog_fd < 0 || !attach_func_name) return -EINVAL; - if (attach_prog_fd) + if (prog->obj->loaded) + return -EINVAL; + + if (attach_prog_fd) { btf_id = libbpf_find_prog_btf_id(attach_func_name, attach_prog_fd); - else - btf_id = libbpf_find_vmlinux_btf_id(attach_func_name, - prog->expected_attach_type); - - if (btf_id < 0) - return btf_id; + if (btf_id < 0) + return btf_id; + } else { + /* load btf_vmlinux, if not yet */ + err = bpf_object__load_vmlinux_btf(prog->obj, true); + if (err) + return err; + err = find_kernel_btf_id(prog->obj, attach_func_name, + prog->expected_attach_type, + &btf_obj_fd, &btf_id); + if (err) + return err; + } prog->attach_btf_id = btf_id; + prog->attach_btf_obj_fd = btf_obj_fd; prog->attach_prog_fd = attach_prog_fd; return 0; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 6909ee81113a..3c35eb401931 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -536,6 +536,7 @@ LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd, ring_buffer_sample_fn sample_cb, void *ctx); LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); /* Perf buffer APIs */ struct perf_buffer; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 4ebfadf45b47..1c0fd2dd233a 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -337,3 +337,16 @@ LIBBPF_0.2.0 { perf_buffer__consume_buffer; xsk_socket__create_shared; } LIBBPF_0.1.0; + +LIBBPF_0.3.0 { + global: + btf__base_btf; + btf__parse_elf_split; + btf__parse_raw_split; + btf__parse_split; + btf__new_empty_split; + btf__new_split; + ring_buffer__epoll_fd; + xsk_setup_xdp_prog; + xsk_socket__update_xskmap; +} LIBBPF_0.2.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index d99bc847bf84..969d0ac592ba 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -151,10 +151,41 @@ int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); int libbpf__load_raw_btf(const char *raw_types, size_t types_len, const char *str_sec, size_t str_len); +struct bpf_prog_load_params { + enum bpf_prog_type prog_type; + enum bpf_attach_type expected_attach_type; + const char *name; + const struct bpf_insn *insns; + size_t insn_cnt; + const char *license; + __u32 kern_version; + __u32 attach_prog_fd; + __u32 attach_btf_obj_fd; + __u32 attach_btf_id; + __u32 prog_ifindex; + __u32 prog_btf_fd; + __u32 prog_flags; + + __u32 func_info_rec_size; + const void *func_info; + __u32 func_info_cnt; + + __u32 line_info_rec_size; + const void *line_info; + __u32 line_info_cnt; + + __u32 log_level; + char *log_buf; + size_t log_buf_sz; +}; + +int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr); + int bpf_object__section_size(const struct bpf_object *obj, const char *name, __u32 *size); int bpf_object__variable_offset(const struct bpf_object *obj, const char *name, __u32 *off); +struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf); struct btf_ext_info { /* diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 5482a9b7ae2d..ecaae2927ab8 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -230,6 +230,7 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) break; case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_INODE_STORAGE: + case BPF_MAP_TYPE_TASK_STORAGE: btf_key_type_id = 1; btf_value_type_id = 3; value_size = 8; diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 5c6522c89af1..8caaafe7e312 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -278,7 +278,13 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) err = ringbuf_process_ring(ring); if (err < 0) return err; - res += cnt; + res += err; } return cnt < 0 ? -errno : res; } + +/* Get an fd that can be used to sleep until data is available in the ring(s) */ +int ring_buffer__epoll_fd(const struct ring_buffer *rb) +{ + return rb->epoll_fd; +} diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 9bc537d0b92d..e3e41ceeb1bc 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -566,8 +566,35 @@ static int xsk_set_bpf_maps(struct xsk_socket *xsk) &xsk->fd, 0); } -static int xsk_setup_xdp_prog(struct xsk_socket *xsk) +static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk) { + char ifname[IFNAMSIZ]; + struct xsk_ctx *ctx; + char *interface; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + interface = if_indextoname(ifindex, &ifname[0]); + if (!interface) { + free(ctx); + return -errno; + } + + ctx->ifindex = ifindex; + memcpy(ctx->ifname, ifname, IFNAMSIZ -1); + ctx->ifname[IFNAMSIZ - 1] = 0; + + xsk->ctx = ctx; + + return 0; +} + +static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, + int *xsks_map_fd) +{ + struct xsk_socket *xsk = _xdp; struct xsk_ctx *ctx = xsk->ctx; __u32 prog_id = 0; int err; @@ -584,8 +611,7 @@ static int xsk_setup_xdp_prog(struct xsk_socket *xsk) err = xsk_load_xdp_prog(xsk); if (err) { - xsk_delete_bpf_maps(xsk); - return err; + goto err_load_xdp_prog; } } else { ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id); @@ -598,15 +624,29 @@ static int xsk_setup_xdp_prog(struct xsk_socket *xsk) } } - if (xsk->rx) + if (xsk->rx) { err = xsk_set_bpf_maps(xsk); - if (err) { - xsk_delete_bpf_maps(xsk); - close(ctx->prog_fd); - return err; + if (err) { + if (!prog_id) { + goto err_set_bpf_maps; + } else { + close(ctx->prog_fd); + return err; + } + } } + if (xsks_map_fd) + *xsks_map_fd = ctx->xsks_map_fd; return 0; + +err_set_bpf_maps: + close(ctx->prog_fd); + bpf_set_link_xdp_fd(ctx->ifindex, -1, 0); +err_load_xdp_prog: + xsk_delete_bpf_maps(xsk); + + return err; } static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex, @@ -689,6 +729,40 @@ static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk, return ctx; } +static void xsk_destroy_xsk_struct(struct xsk_socket *xsk) +{ + free(xsk->ctx); + free(xsk); +} + +int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd) +{ + xsk->ctx->xsks_map_fd = fd; + return xsk_set_bpf_maps(xsk); +} + +int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd) +{ + struct xsk_socket *xsk; + int res; + + xsk = calloc(1, sizeof(*xsk)); + if (!xsk) + return -ENOMEM; + + res = xsk_create_xsk_struct(ifindex, xsk); + if (res) { + free(xsk); + return -EINVAL; + } + + res = __xsk_setup_xdp_prog(xsk, xsks_map_fd); + + xsk_destroy_xsk_struct(xsk); + + return res; +} + int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, const char *ifname, __u32 queue_id, struct xsk_umem *umem, @@ -838,7 +912,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, ctx->prog_fd = -1; if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) { - err = xsk_setup_xdp_prog(xsk); + err = __xsk_setup_xdp_prog(xsk, NULL); if (err) goto out_mmap_tx; } diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h index 1069c46364ff..e9f121f5d129 100644 --- a/tools/lib/bpf/xsk.h +++ b/tools/lib/bpf/xsk.h @@ -113,8 +113,7 @@ static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb) return (entries > nb) ? nb : entries; } -static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod, - size_t nb, __u32 *idx) +static inline __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx) { if (xsk_prod_nb_free(prod, nb) < nb) return 0; @@ -125,7 +124,7 @@ static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod, return nb; } -static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, size_t nb) +static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb) { /* Make sure everything has been written to the ring before indicating * this to the kernel by writing the producer pointer. @@ -135,10 +134,9 @@ static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, size_t nb) *prod->producer += nb; } -static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons, - size_t nb, __u32 *idx) +static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx) { - size_t entries = xsk_cons_nb_avail(cons, nb); + __u32 entries = xsk_cons_nb_avail(cons, nb); if (entries > 0) { /* Make sure we do not speculatively read the data before @@ -153,7 +151,12 @@ static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons, return entries; } -static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb) +static inline void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb) +{ + cons->cached_cons -= nb; +} + +static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb) { /* Make sure data has been read before indicating we are done * with the entries by updating the consumer pointer. @@ -201,6 +204,11 @@ struct xsk_umem_config { __u32 flags; }; +LIBBPF_API int xsk_setup_xdp_prog(int ifindex, + int *xsks_map_fd); +LIBBPF_API int xsk_socket__update_xskmap(struct xsk_socket *xsk, + int xsks_map_fd); + /* Flags for the libbpf_flags field. */ #define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0) diff --git a/tools/memory-model/Documentation/README b/tools/memory-model/Documentation/README new file mode 100644 index 000000000000..db90a26dbdf4 --- /dev/null +++ b/tools/memory-model/Documentation/README @@ -0,0 +1,76 @@ +It has been said that successful communication requires first identifying +what your audience knows and then building a bridge from their current +knowledge to what they need to know. Unfortunately, the expected +Linux-kernel memory model (LKMM) audience might be anywhere from novice +to expert both in kernel hacking and in understanding LKMM. + +This document therefore points out a number of places to start reading, +depending on what you know and what you would like to learn. Please note +that the documents later in this list assume that the reader understands +the material provided by documents earlier in this list. + +o You are new to Linux-kernel concurrency: simple.txt + +o You have some background in Linux-kernel concurrency, and would + like an overview of the types of low-level concurrency primitives + that the Linux kernel provides: ordering.txt + + Here, "low level" means atomic operations to single variables. + +o You are familiar with the Linux-kernel concurrency primitives + that you need, and just want to get started with LKMM litmus + tests: litmus-tests.txt + +o You are familiar with Linux-kernel concurrency, and would + like a detailed intuitive understanding of LKMM, including + situations involving more than two threads: recipes.txt + +o You would like a detailed understanding of what your compiler can + and cannot do to control dependencies: control-dependencies.txt + +o You are familiar with Linux-kernel concurrency and the use of + LKMM, and would like a quick reference: cheatsheet.txt + +o You are familiar with Linux-kernel concurrency and the use + of LKMM, and would like to learn about LKMM's requirements, + rationale, and implementation: explanation.txt + +o You are interested in the publications related to LKMM, including + hardware manuals, academic literature, standards-committee + working papers, and LWN articles: references.txt + + +==================== +DESCRIPTION OF FILES +==================== + +README + This file. + +cheatsheet.txt + Quick-reference guide to the Linux-kernel memory model. + +control-dependencies.txt + Guide to preventing compiler optimizations from destroying + your control dependencies. + +explanation.txt + Detailed description of the memory model. + +litmus-tests.txt + The format, features, capabilities, and limitations of the litmus + tests that LKMM can evaluate. + +ordering.txt + Overview of the Linux kernel's low-level memory-ordering + primitives by category. + +recipes.txt + Common memory-ordering patterns. + +references.txt + Background information. + +simple.txt + Starting point for someone new to Linux-kernel concurrency. + And also a reminder of the simpler approaches to concurrency! diff --git a/tools/memory-model/Documentation/control-dependencies.txt b/tools/memory-model/Documentation/control-dependencies.txt new file mode 100644 index 000000000000..8b743d20fe27 --- /dev/null +++ b/tools/memory-model/Documentation/control-dependencies.txt @@ -0,0 +1,258 @@ +CONTROL DEPENDENCIES +==================== + +A major difficulty with control dependencies is that current compilers +do not support them. One purpose of this document is therefore to +help you prevent your compiler from breaking your code. However, +control dependencies also pose other challenges, which leads to the +second purpose of this document, namely to help you to avoid breaking +your own code, even in the absence of help from your compiler. + +One such challenge is that control dependencies order only later stores. +Therefore, a load-load control dependency will not preserve ordering +unless a read memory barrier is provided. Consider the following code: + + q = READ_ONCE(a); + if (q) + p = READ_ONCE(b); + +This is not guaranteed to provide any ordering because some types of CPUs +are permitted to predict the result of the load from "b". This prediction +can cause other CPUs to see this load as having happened before the load +from "a". This means that an explicit read barrier is required, for example +as follows: + + q = READ_ONCE(a); + if (q) { + smp_rmb(); + p = READ_ONCE(b); + } + +However, stores are not speculated. This means that ordering is +(usually) guaranteed for load-store control dependencies, as in the +following example: + + q = READ_ONCE(a); + if (q) + WRITE_ONCE(b, 1); + +Control dependencies can pair with each other and with other types +of ordering. But please note that neither the READ_ONCE() nor the +WRITE_ONCE() are optional. Without the READ_ONCE(), the compiler might +fuse the load from "a" with other loads. Without the WRITE_ONCE(), +the compiler might fuse the store to "b" with other stores. Worse yet, +the compiler might convert the store into a load and a check followed +by a store, and this compiler-generated load would not be ordered by +the control dependency. + +Furthermore, if the compiler is able to prove that the value of variable +"a" is always non-zero, it would be well within its rights to optimize +the original example by eliminating the "if" statement as follows: + + q = a; + b = 1; /* BUG: Compiler and CPU can both reorder!!! */ + +So don't leave out either the READ_ONCE() or the WRITE_ONCE(). +In particular, although READ_ONCE() does force the compiler to emit a +load, it does *not* force the compiler to actually use the loaded value. + +It is tempting to try use control dependencies to enforce ordering on +identical stores on both branches of the "if" statement as follows: + + q = READ_ONCE(a); + if (q) { + barrier(); + WRITE_ONCE(b, 1); + do_something(); + } else { + barrier(); + WRITE_ONCE(b, 1); + do_something_else(); + } + +Unfortunately, current compilers will transform this as follows at high +optimization levels: + + q = READ_ONCE(a); + barrier(); + WRITE_ONCE(b, 1); /* BUG: No ordering vs. load from a!!! */ + if (q) { + /* WRITE_ONCE(b, 1); -- moved up, BUG!!! */ + do_something(); + } else { + /* WRITE_ONCE(b, 1); -- moved up, BUG!!! */ + do_something_else(); + } + +Now there is no conditional between the load from "a" and the store to +"b", which means that the CPU is within its rights to reorder them: The +conditional is absolutely required, and must be present in the final +assembly code, after all of the compiler and link-time optimizations +have been applied. Therefore, if you need ordering in this example, +you must use explicit memory ordering, for example, smp_store_release(): + + q = READ_ONCE(a); + if (q) { + smp_store_release(&b, 1); + do_something(); + } else { + smp_store_release(&b, 1); + do_something_else(); + } + +Without explicit memory ordering, control-dependency-based ordering is +guaranteed only when the stores differ, for example: + + q = READ_ONCE(a); + if (q) { + WRITE_ONCE(b, 1); + do_something(); + } else { + WRITE_ONCE(b, 2); + do_something_else(); + } + +The initial READ_ONCE() is still required to prevent the compiler from +knowing too much about the value of "a". + +But please note that you need to be careful what you do with the local +variable "q", otherwise the compiler might be able to guess the value +and again remove the conditional branch that is absolutely required to +preserve ordering. For example: + + q = READ_ONCE(a); + if (q % MAX) { + WRITE_ONCE(b, 1); + do_something(); + } else { + WRITE_ONCE(b, 2); + do_something_else(); + } + +If MAX is compile-time defined to be 1, then the compiler knows that +(q % MAX) must be equal to zero, regardless of the value of "q". +The compiler is therefore within its rights to transform the above code +into the following: + + q = READ_ONCE(a); + WRITE_ONCE(b, 2); + do_something_else(); + +Given this transformation, the CPU is not required to respect the ordering +between the load from variable "a" and the store to variable "b". It is +tempting to add a barrier(), but this does not help. The conditional +is gone, and the barrier won't bring it back. Therefore, if you need +to relying on control dependencies to produce this ordering, you should +make sure that MAX is greater than one, perhaps as follows: + + q = READ_ONCE(a); + BUILD_BUG_ON(MAX <= 1); /* Order load from a with store to b. */ + if (q % MAX) { + WRITE_ONCE(b, 1); + do_something(); + } else { + WRITE_ONCE(b, 2); + do_something_else(); + } + +Please note once again that each leg of the "if" statement absolutely +must store different values to "b". As in previous examples, if the two +values were identical, the compiler could pull this store outside of the +"if" statement, destroying the control dependency's ordering properties. + +You must also be careful avoid relying too much on boolean short-circuit +evaluation. Consider this example: + + q = READ_ONCE(a); + if (q || 1 > 0) + WRITE_ONCE(b, 1); + +Because the first condition cannot fault and the second condition is +always true, the compiler can transform this example as follows, again +destroying the control dependency's ordering: + + q = READ_ONCE(a); + WRITE_ONCE(b, 1); + +This is yet another example showing the importance of preventing the +compiler from out-guessing your code. Again, although READ_ONCE() really +does force the compiler to emit code for a given load, the compiler is +within its rights to discard the loaded value. + +In addition, control dependencies apply only to the then-clause and +else-clause of the "if" statement in question. In particular, they do +not necessarily order the code following the entire "if" statement: + + q = READ_ONCE(a); + if (q) { + WRITE_ONCE(b, 1); + } else { + WRITE_ONCE(b, 2); + } + WRITE_ONCE(c, 1); /* BUG: No ordering against the read from "a". */ + +It is tempting to argue that there in fact is ordering because the +compiler cannot reorder volatile accesses and also cannot reorder +the writes to "b" with the condition. Unfortunately for this line +of reasoning, the compiler might compile the two writes to "b" as +conditional-move instructions, as in this fanciful pseudo-assembly +language: + + ld r1,a + cmp r1,$0 + cmov,ne r4,$1 + cmov,eq r4,$2 + st r4,b + st $1,c + +The control dependencies would then extend only to the pair of cmov +instructions and the store depending on them. This means that a weakly +ordered CPU would have no dependency of any sort between the load from +"a" and the store to "c". In short, control dependencies provide ordering +only to the stores in the then-clause and else-clause of the "if" statement +in question (including functions invoked by those two clauses), and not +to code following that "if" statement. + + +In summary: + + (*) Control dependencies can order prior loads against later stores. + However, they do *not* guarantee any other sort of ordering: + Not prior loads against later loads, nor prior stores against + later anything. If you need these other forms of ordering, use + smp_load_acquire(), smp_store_release(), or, in the case of prior + stores and later loads, smp_mb(). + + (*) If both legs of the "if" statement contain identical stores to + the same variable, then you must explicitly order those stores, + either by preceding both of them with smp_mb() or by using + smp_store_release(). Please note that it is *not* sufficient to use + barrier() at beginning and end of each leg of the "if" statement + because, as shown by the example above, optimizing compilers can + destroy the control dependency while respecting the letter of the + barrier() law. + + (*) Control dependencies require at least one run-time conditional + between the prior load and the subsequent store, and this + conditional must involve the prior load. If the compiler is able + to optimize the conditional away, it will have also optimized + away the ordering. Careful use of READ_ONCE() and WRITE_ONCE() + can help to preserve the needed conditional. + + (*) Control dependencies require that the compiler avoid reordering the + dependency into nonexistence. Careful use of READ_ONCE() or + atomic{,64}_read() can help to preserve your control dependency. + + (*) Control dependencies apply only to the then-clause and else-clause + of the "if" statement containing the control dependency, including + any functions that these two clauses call. Control dependencies + do *not* apply to code beyond the end of that "if" statement. + + (*) Control dependencies pair normally with other types of barriers. + + (*) Control dependencies do *not* provide multicopy atomicity. If you + need all the CPUs to agree on the ordering of a given store against + all other accesses, use smp_mb(). + + (*) Compilers do not understand control dependencies. It is therefore + your job to ensure that they do not break your code. diff --git a/tools/memory-model/Documentation/glossary.txt b/tools/memory-model/Documentation/glossary.txt new file mode 100644 index 000000000000..79acb75d56ea --- /dev/null +++ b/tools/memory-model/Documentation/glossary.txt @@ -0,0 +1,172 @@ +This document contains brief definitions of LKMM-related terms. Like most +glossaries, it is not intended to be read front to back (except perhaps +as a way of confirming a diagnosis of OCD), but rather to be searched +for specific terms. + + +Address Dependency: When the address of a later memory access is computed + based on the value returned by an earlier load, an "address + dependency" extends from that load extending to the later access. + Address dependencies are quite common in RCU read-side critical + sections: + + 1 rcu_read_lock(); + 2 p = rcu_dereference(gp); + 3 do_something(p->a); + 4 rcu_read_unlock(); + + In this case, because the address of "p->a" on line 3 is computed + from the value returned by the rcu_dereference() on line 2, the + address dependency extends from that rcu_dereference() to that + "p->a". In rare cases, optimizing compilers can destroy address + dependencies. Please see Documentation/RCU/rcu_dereference.txt + for more information. + + See also "Control Dependency" and "Data Dependency". + +Acquire: With respect to a lock, acquiring that lock, for example, + using spin_lock(). With respect to a non-lock shared variable, + a special operation that includes a load and which orders that + load before later memory references running on that same CPU. + An example special acquire operation is smp_load_acquire(), + but atomic_read_acquire() and atomic_xchg_acquire() also include + acquire loads. + + When an acquire load returns the value stored by a release store + to that same variable, then all operations preceding that store + happen before any operations following that load acquire. + + See also "Relaxed" and "Release". + +Coherence (co): When one CPU's store to a given variable overwrites + either the value from another CPU's store or some later value, + there is said to be a coherence link from the second CPU to + the first. + + It is also possible to have a coherence link within a CPU, which + is a "coherence internal" (coi) link. The term "coherence + external" (coe) link is used when it is necessary to exclude + the coi case. + + See also "From-reads" and "Reads-from". + +Control Dependency: When a later store's execution depends on a test + of a value computed from a value returned by an earlier load, + a "control dependency" extends from that load to that store. + For example: + + 1 if (READ_ONCE(x)) + 2 WRITE_ONCE(y, 1); + + Here, the control dependency extends from the READ_ONCE() on + line 1 to the WRITE_ONCE() on line 2. Control dependencies are + fragile, and can be easily destroyed by optimizing compilers. + Please see control-dependencies.txt for more information. + + See also "Address Dependency" and "Data Dependency". + +Cycle: Memory-barrier pairing is restricted to a pair of CPUs, as the + name suggests. And in a great many cases, a pair of CPUs is all + that is required. In other cases, the notion of pairing must be + extended to additional CPUs, and the result is called a "cycle". + In a cycle, each CPU's ordering interacts with that of the next: + + CPU 0 CPU 1 CPU 2 + WRITE_ONCE(x, 1); WRITE_ONCE(y, 1); WRITE_ONCE(z, 1); + smp_mb(); smp_mb(); smp_mb(); + r0 = READ_ONCE(y); r1 = READ_ONCE(z); r2 = READ_ONCE(x); + + CPU 0's smp_mb() interacts with that of CPU 1, which interacts + with that of CPU 2, which in turn interacts with that of CPU 0 + to complete the cycle. Because of the smp_mb() calls between + each pair of memory accesses, the outcome where r0, r1, and r2 + are all equal to zero is forbidden by LKMM. + + See also "Pairing". + +Data Dependency: When the data written by a later store is computed based + on the value returned by an earlier load, a "data dependency" + extends from that load to that later store. For example: + + 1 r1 = READ_ONCE(x); + 2 WRITE_ONCE(y, r1 + 1); + + In this case, the data dependency extends from the READ_ONCE() + on line 1 to the WRITE_ONCE() on line 2. Data dependencies are + fragile and can be easily destroyed by optimizing compilers. + Because optimizing compilers put a great deal of effort into + working out what values integer variables might have, this is + especially true in cases where the dependency is carried through + an integer. + + See also "Address Dependency" and "Control Dependency". + +From-Reads (fr): When one CPU's store to a given variable happened + too late to affect the value returned by another CPU's + load from that same variable, there is said to be a from-reads + link from the load to the store. + + It is also possible to have a from-reads link within a CPU, which + is a "from-reads internal" (fri) link. The term "from-reads + external" (fre) link is used when it is necessary to exclude + the fri case. + + See also "Coherence" and "Reads-from". + +Fully Ordered: An operation such as smp_mb() that orders all of + its CPU's prior accesses with all of that CPU's subsequent + accesses, or a marked access such as atomic_add_return() + that orders all of its CPU's prior accesses, itself, and + all of its CPU's subsequent accesses. + +Marked Access: An access to a variable that uses an special function or + macro such as "r1 = READ_ONCE(x)" or "smp_store_release(&a, 1)". + + See also "Unmarked Access". + +Pairing: "Memory-barrier pairing" reflects the fact that synchronizing + data between two CPUs requires that both CPUs their accesses. + Memory barriers thus tend to come in pairs, one executed by + one of the CPUs and the other by the other CPU. Of course, + pairing also occurs with other types of operations, so that a + smp_store_release() pairs with an smp_load_acquire() that reads + the value stored. + + See also "Cycle". + +Reads-From (rf): When one CPU's load returns the value stored by some other + CPU, there is said to be a reads-from link from the second + CPU's store to the first CPU's load. Reads-from links have the + nice property that time must advance from the store to the load, + which means that algorithms using reads-from links can use lighter + weight ordering and synchronization compared to algorithms using + coherence and from-reads links. + + It is also possible to have a reads-from link within a CPU, which + is a "reads-from internal" (rfi) link. The term "reads-from + external" (rfe) link is used when it is necessary to exclude + the rfi case. + + See also Coherence" and "From-reads". + +Relaxed: A marked access that does not imply ordering, for example, a + READ_ONCE(), WRITE_ONCE(), a non-value-returning read-modify-write + operation, or a value-returning read-modify-write operation whose + name ends in "_relaxed". + + See also "Acquire" and "Release". + +Release: With respect to a lock, releasing that lock, for example, + using spin_unlock(). With respect to a non-lock shared variable, + a special operation that includes a store and which orders that + store after earlier memory references that ran on that same CPU. + An example special release store is smp_store_release(), but + atomic_set_release() and atomic_cmpxchg_release() also include + release stores. + + See also "Acquire" and "Relaxed". + +Unmarked Access: An access to a variable that uses normal C-language + syntax, for example, "a = b[2]"; + + See also "Marked Access". diff --git a/tools/memory-model/Documentation/litmus-tests.txt b/tools/memory-model/Documentation/litmus-tests.txt index 2f840dcd15cf..8a9d5d2787f9 100644 --- a/tools/memory-model/Documentation/litmus-tests.txt +++ b/tools/memory-model/Documentation/litmus-tests.txt @@ -946,6 +946,23 @@ Limitations of the Linux-kernel memory model (LKMM) include: carrying a dependency, then the compiler can break that dependency by substituting a constant of that value. + Conversely, LKMM sometimes doesn't recognize that a particular + optimization is not allowed, and as a result, thinks that a + dependency is not present (because the optimization would break it). + The memory model misses some pretty obvious control dependencies + because of this limitation. A simple example is: + + r1 = READ_ONCE(x); + if (r1 == 0) + smp_mb(); + WRITE_ONCE(y, 1); + + There is a control dependency from the READ_ONCE to the WRITE_ONCE, + even when r1 is nonzero, but LKMM doesn't realize this and thinks + that the write may execute before the read if r1 != 0. (Yes, that + doesn't make sense if you think about it, but the memory model's + intelligence is limited.) + 2. Multiple access sizes for a single variable are not supported, and neither are misaligned or partially overlapping accesses. diff --git a/tools/memory-model/Documentation/ordering.txt b/tools/memory-model/Documentation/ordering.txt new file mode 100644 index 000000000000..9b0949d3f5ec --- /dev/null +++ b/tools/memory-model/Documentation/ordering.txt @@ -0,0 +1,556 @@ +This document gives an overview of the categories of memory-ordering +operations provided by the Linux-kernel memory model (LKMM). + + +Categories of Ordering +====================== + +This section lists LKMM's three top-level categories of memory-ordering +operations in decreasing order of strength: + +1. Barriers (also known as "fences"). A barrier orders some or + all of the CPU's prior operations against some or all of its + subsequent operations. + +2. Ordered memory accesses. These operations order themselves + against some or all of the CPU's prior accesses or some or all + of the CPU's subsequent accesses, depending on the subcategory + of the operation. + +3. Unordered accesses, as the name indicates, have no ordering + properties except to the extent that they interact with an + operation in the previous categories. This being the real world, + some of these "unordered" operations provide limited ordering + in some special situations. + +Each of the above categories is described in more detail by one of the +following sections. + + +Barriers +======== + +Each of the following categories of barriers is described in its own +subsection below: + +a. Full memory barriers. + +b. Read-modify-write (RMW) ordering augmentation barriers. + +c. Write memory barrier. + +d. Read memory barrier. + +e. Compiler barrier. + +Note well that many of these primitives generate absolutely no code +in kernels built with CONFIG_SMP=n. Therefore, if you are writing +a device driver, which must correctly order accesses to a physical +device even in kernels built with CONFIG_SMP=n, please use the +ordering primitives provided for that purpose. For example, instead of +smp_mb(), use mb(). See the "Linux Kernel Device Drivers" book or the +https://lwn.net/Articles/698014/ article for more information. + + +Full Memory Barriers +-------------------- + +The Linux-kernel primitives that provide full ordering include: + +o The smp_mb() full memory barrier. + +o Value-returning RMW atomic operations whose names do not end in + _acquire, _release, or _relaxed. + +o RCU's grace-period primitives. + +First, the smp_mb() full memory barrier orders all of the CPU's prior +accesses against all subsequent accesses from the viewpoint of all CPUs. +In other words, all CPUs will agree that any earlier action taken +by that CPU happened before any later action taken by that same CPU. +For example, consider the following: + + WRITE_ONCE(x, 1); + smp_mb(); // Order store to x before load from y. + r1 = READ_ONCE(y); + +All CPUs will agree that the store to "x" happened before the load +from "y", as indicated by the comment. And yes, please comment your +memory-ordering primitives. It is surprisingly hard to remember their +purpose after even a few months. + +Second, some RMW atomic operations provide full ordering. These +operations include value-returning RMW atomic operations (that is, those +with non-void return types) whose names do not end in _acquire, _release, +or _relaxed. Examples include atomic_add_return(), atomic_dec_and_test(), +cmpxchg(), and xchg(). Note that conditional RMW atomic operations such +as cmpxchg() are only guaranteed to provide ordering when they succeed. +When RMW atomic operations provide full ordering, they partition the +CPU's accesses into three groups: + +1. All code that executed prior to the RMW atomic operation. + +2. The RMW atomic operation itself. + +3. All code that executed after the RMW atomic operation. + +All CPUs will agree that any operation in a given partition happened +before any operation in a higher-numbered partition. + +In contrast, non-value-returning RMW atomic operations (that is, those +with void return types) do not guarantee any ordering whatsoever. Nor do +value-returning RMW atomic operations whose names end in _relaxed. +Examples of the former include atomic_inc() and atomic_dec(), +while examples of the latter include atomic_cmpxchg_relaxed() and +atomic_xchg_relaxed(). Similarly, value-returning non-RMW atomic +operations such as atomic_read() do not guarantee full ordering, and +are covered in the later section on unordered operations. + +Value-returning RMW atomic operations whose names end in _acquire or +_release provide limited ordering, and will be described later in this +document. + +Finally, RCU's grace-period primitives provide full ordering. These +primitives include synchronize_rcu(), synchronize_rcu_expedited(), +synchronize_srcu() and so on. However, these primitives have orders +of magnitude greater overhead than smp_mb(), atomic_xchg(), and so on. +Furthermore, RCU's grace-period primitives can only be invoked in +sleepable contexts. Therefore, RCU's grace-period primitives are +typically instead used to provide ordering against RCU read-side critical +sections, as documented in their comment headers. But of course if you +need a synchronize_rcu() to interact with readers, it costs you nothing +to also rely on its additional full-memory-barrier semantics. Just please +carefully comment this, otherwise your future self will hate you. + + +RMW Ordering Augmentation Barriers +---------------------------------- + +As noted in the previous section, non-value-returning RMW operations +such as atomic_inc() and atomic_dec() guarantee no ordering whatsoever. +Nevertheless, a number of popular CPU families, including x86, provide +full ordering for these primitives. One way to obtain full ordering on +all architectures is to add a call to smp_mb(): + + WRITE_ONCE(x, 1); + atomic_inc(&my_counter); + smp_mb(); // Inefficient on x86!!! + r1 = READ_ONCE(y); + +This works, but the added smp_mb() adds needless overhead for +x86, on which atomic_inc() provides full ordering all by itself. +The smp_mb__after_atomic() primitive can be used instead: + + WRITE_ONCE(x, 1); + atomic_inc(&my_counter); + smp_mb__after_atomic(); // Order store to x before load from y. + r1 = READ_ONCE(y); + +The smp_mb__after_atomic() primitive emits code only on CPUs whose +atomic_inc() implementations do not guarantee full ordering, thus +incurring no unnecessary overhead on x86. There are a number of +variations on the smp_mb__*() theme: + +o smp_mb__before_atomic(), which provides full ordering prior + to an unordered RMW atomic operation. + +o smp_mb__after_atomic(), which, as shown above, provides full + ordering subsequent to an unordered RMW atomic operation. + +o smp_mb__after_spinlock(), which provides full ordering subsequent + to a successful spinlock acquisition. Note that spin_lock() is + always successful but spin_trylock() might not be. + +o smp_mb__after_srcu_read_unlock(), which provides full ordering + subsequent to an srcu_read_unlock(). + +It is bad practice to place code between the smp__*() primitive and the +operation whose ordering that it is augmenting. The reason is that the +ordering of this intervening code will differ from one CPU architecture +to another. + + +Write Memory Barrier +-------------------- + +The Linux kernel's write memory barrier is smp_wmb(). If a CPU executes +the following code: + + WRITE_ONCE(x, 1); + smp_wmb(); + WRITE_ONCE(y, 1); + +Then any given CPU will see the write to "x" has having happened before +the write to "y". However, you are usually better off using a release +store, as described in the "Release Operations" section below. + +Note that smp_wmb() might fail to provide ordering for unmarked C-language +stores because profile-driven optimization could determine that the +value being overwritten is almost always equal to the new value. Such a +compiler might then reasonably decide to transform "x = 1" and "y = 1" +as follows: + + if (x != 1) + x = 1; + smp_wmb(); // BUG: does not order the reads!!! + if (y != 1) + y = 1; + +Therefore, if you need to use smp_wmb() with unmarked C-language writes, +you will need to make sure that none of the compilers used to build +the Linux kernel carry out this sort of transformation, both now and in +the future. + + +Read Memory Barrier +------------------- + +The Linux kernel's read memory barrier is smp_rmb(). If a CPU executes +the following code: + + r0 = READ_ONCE(y); + smp_rmb(); + r1 = READ_ONCE(x); + +Then any given CPU will see the read from "y" as having preceded the read from +"x". However, you are usually better off using an acquire load, as described +in the "Acquire Operations" section below. + +Compiler Barrier +---------------- + +The Linux kernel's compiler barrier is barrier(). This primitive +prohibits compiler code-motion optimizations that might move memory +references across the point in the code containing the barrier(), but +does not constrain hardware memory ordering. For example, this can be +used to prevent to compiler from moving code across an infinite loop: + + WRITE_ONCE(x, 1); + while (dontstop) + barrier(); + r1 = READ_ONCE(y); + +Without the barrier(), the compiler would be within its rights to move the +WRITE_ONCE() to follow the loop. This code motion could be problematic +in the case where an interrupt handler terminates the loop. Another way +to handle this is to use READ_ONCE() for the load of "dontstop". + +Note that the barriers discussed previously use barrier() or its low-level +equivalent in their implementations. + + +Ordered Memory Accesses +======================= + +The Linux kernel provides a wide variety of ordered memory accesses: + +a. Release operations. + +b. Acquire operations. + +c. RCU read-side ordering. + +d. Control dependencies. + +Each of the above categories has its own section below. + + +Release Operations +------------------ + +Release operations include smp_store_release(), atomic_set_release(), +rcu_assign_pointer(), and value-returning RMW operations whose names +end in _release. These operations order their own store against all +of the CPU's prior memory accesses. Release operations often provide +improved readability and performance compared to explicit barriers. +For example, use of smp_store_release() saves a line compared to the +smp_wmb() example above: + + WRITE_ONCE(x, 1); + smp_store_release(&y, 1); + +More important, smp_store_release() makes it easier to connect up the +different pieces of the concurrent algorithm. The variable stored to +by the smp_store_release(), in this case "y", will normally be used in +an acquire operation in other parts of the concurrent algorithm. + +To see the performance advantages, suppose that the above example read +from "x" instead of writing to it. Then an smp_wmb() could not guarantee +ordering, and an smp_mb() would be needed instead: + + r1 = READ_ONCE(x); + smp_mb(); + WRITE_ONCE(y, 1); + +But smp_mb() often incurs much higher overhead than does +smp_store_release(), which still provides the needed ordering of "x" +against "y". On x86, the version using smp_store_release() might compile +to a simple load instruction followed by a simple store instruction. +In contrast, the smp_mb() compiles to an expensive instruction that +provides the needed ordering. + +There is a wide variety of release operations: + +o Store operations, including not only the aforementioned + smp_store_release(), but also atomic_set_release(), and + atomic_long_set_release(). + +o RCU's rcu_assign_pointer() operation. This is the same as + smp_store_release() except that: (1) It takes the pointer to + be assigned to instead of a pointer to that pointer, (2) It + is intended to be used in conjunction with rcu_dereference() + and similar rather than smp_load_acquire(), and (3) It checks + for an RCU-protected pointer in "sparse" runs. + +o Value-returning RMW operations whose names end in _release, + such as atomic_fetch_add_release() and cmpxchg_release(). + Note that release ordering is guaranteed only against the + memory-store portion of the RMW operation, and not against the + memory-load portion. Note also that conditional operations such + as cmpxchg_release() are only guaranteed to provide ordering + when they succeed. + +As mentioned earlier, release operations are often paired with acquire +operations, which are the subject of the next section. + + +Acquire Operations +------------------ + +Acquire operations include smp_load_acquire(), atomic_read_acquire(), +and value-returning RMW operations whose names end in _acquire. These +operations order their own load against all of the CPU's subsequent +memory accesses. Acquire operations often provide improved performance +and readability compared to explicit barriers. For example, use of +smp_load_acquire() saves a line compared to the smp_rmb() example above: + + r0 = smp_load_acquire(&y); + r1 = READ_ONCE(x); + +As with smp_store_release(), this also makes it easier to connect +the different pieces of the concurrent algorithm by looking for the +smp_store_release() that stores to "y". In addition, smp_load_acquire() +improves upon smp_rmb() by ordering against subsequent stores as well +as against subsequent loads. + +There are a couple of categories of acquire operations: + +o Load operations, including not only the aforementioned + smp_load_acquire(), but also atomic_read_acquire(), and + atomic64_read_acquire(). + +o Value-returning RMW operations whose names end in _acquire, + such as atomic_xchg_acquire() and atomic_cmpxchg_acquire(). + Note that acquire ordering is guaranteed only against the + memory-load portion of the RMW operation, and not against the + memory-store portion. Note also that conditional operations + such as atomic_cmpxchg_acquire() are only guaranteed to provide + ordering when they succeed. + +Symmetry being what it is, acquire operations are often paired with the +release operations covered earlier. For example, consider the following +example, where task0() and task1() execute concurrently: + + void task0(void) + { + WRITE_ONCE(x, 1); + smp_store_release(&y, 1); + } + + void task1(void) + { + r0 = smp_load_acquire(&y); + r1 = READ_ONCE(x); + } + +If "x" and "y" are both initially zero, then either r0's final value +will be zero or r1's final value will be one, thus providing the required +ordering. + + +RCU Read-Side Ordering +---------------------- + +This category includes read-side markers such as rcu_read_lock() +and rcu_read_unlock() as well as pointer-traversal primitives such as +rcu_dereference() and srcu_dereference(). + +Compared to locking primitives and RMW atomic operations, markers +for RCU read-side critical sections incur very low overhead because +they interact only with the corresponding grace-period primitives. +For example, the rcu_read_lock() and rcu_read_unlock() markers interact +with synchronize_rcu(), synchronize_rcu_expedited(), and call_rcu(). +The way this works is that if a given call to synchronize_rcu() cannot +prove that it started before a given call to rcu_read_lock(), then +that synchronize_rcu() must block until the matching rcu_read_unlock() +is reached. For more information, please see the synchronize_rcu() +docbook header comment and the material in Documentation/RCU. + +RCU's pointer-traversal primitives, including rcu_dereference() and +srcu_dereference(), order their load (which must be a pointer) against any +of the CPU's subsequent memory accesses whose address has been calculated +from the value loaded. There is said to be an *address dependency* +from the value returned by the rcu_dereference() or srcu_dereference() +to that subsequent memory access. + +A call to rcu_dereference() for a given RCU-protected pointer is +usually paired with a call to a call to rcu_assign_pointer() for that +same pointer in much the same way that a call to smp_load_acquire() is +paired with a call to smp_store_release(). Calls to rcu_dereference() +and rcu_assign_pointer are often buried in other APIs, for example, +the RCU list API members defined in include/linux/rculist.h. For more +information, please see the docbook headers in that file, the most +recent LWN article on the RCU API (https://lwn.net/Articles/777036/), +and of course the material in Documentation/RCU. + +If the pointer value is manipulated between the rcu_dereference() +that returned it and a later dereference(), please read +Documentation/RCU/rcu_dereference.rst. It can also be quite helpful to +review uses in the Linux kernel. + + +Control Dependencies +-------------------- + +A control dependency extends from a marked load (READ_ONCE() or stronger) +through an "if" condition to a marked store (WRITE_ONCE() or stronger) +that is executed only by one of the legs of that "if" statement. +Control dependencies are so named because they are mediated by +control-flow instructions such as comparisons and conditional branches. + +In short, you can use a control dependency to enforce ordering between +an READ_ONCE() and a WRITE_ONCE() when there is an "if" condition +between them. The canonical example is as follows: + + q = READ_ONCE(a); + if (q) + WRITE_ONCE(b, 1); + +In this case, all CPUs would see the read from "a" as happening before +the write to "b". + +However, control dependencies are easily destroyed by compiler +optimizations, so any use of control dependencies must take into account +all of the compilers used to build the Linux kernel. Please see the +"control-dependencies.txt" file for more information. + + +Unordered Accesses +================== + +Each of these two categories of unordered accesses has a section below: + +a. Unordered marked operations. + +b. Unmarked C-language accesses. + + +Unordered Marked Operations +--------------------------- + +Unordered operations to different variables are just that, unordered. +However, if a group of CPUs apply these operations to a single variable, +all the CPUs will agree on the operation order. Of course, the ordering +of unordered marked accesses can also be constrained using the mechanisms +described earlier in this document. + +These operations come in three categories: + +o Marked writes, such as WRITE_ONCE() and atomic_set(). These + primitives required the compiler to emit the corresponding store + instructions in the expected execution order, thus suppressing + a number of destructive optimizations. However, they provide no + hardware ordering guarantees, and in fact many CPUs will happily + reorder marked writes with each other or with other unordered + operations, unless these operations are to the same variable. + +o Marked reads, such as READ_ONCE() and atomic_read(). These + primitives required the compiler to emit the corresponding load + instructions in the expected execution order, thus suppressing + a number of destructive optimizations. However, they provide no + hardware ordering guarantees, and in fact many CPUs will happily + reorder marked reads with each other or with other unordered + operations, unless these operations are to the same variable. + +o Unordered RMW atomic operations. These are non-value-returning + RMW atomic operations whose names do not end in _acquire or + _release, and also value-returning RMW operations whose names + end in _relaxed. Examples include atomic_add(), atomic_or(), + and atomic64_fetch_xor_relaxed(). These operations do carry + out the specified RMW operation atomically, for example, five + concurrent atomic_inc() operations applied to a given variable + will reliably increase the value of that variable by five. + However, many CPUs will happily reorder these operations with + each other or with other unordered operations. + + This category of operations can be efficiently ordered using + smp_mb__before_atomic() and smp_mb__after_atomic(), as was + discussed in the "RMW Ordering Augmentation Barriers" section. + +In short, these operations can be freely reordered unless they are all +operating on a single variable or unless they are constrained by one of +the operations called out earlier in this document. + + +Unmarked C-Language Accesses +---------------------------- + +Unmarked C-language accesses are normal variable accesses to normal +variables, that is, to variables that are not "volatile" and are not +C11 atomic variables. These operations provide no ordering guarantees, +and further do not guarantee "atomic" access. For example, the compiler +might (and sometimes does) split a plain C-language store into multiple +smaller stores. A load from that same variable running on some other +CPU while such a store is executing might see a value that is a mashup +of the old value and the new value. + +Unmarked C-language accesses are unordered, and are also subject to +any number of compiler optimizations, many of which can break your +concurrent code. It is possible to used unmarked C-language accesses for +shared variables that are subject to concurrent access, but great care +is required on an ongoing basis. The compiler-constraining barrier() +primitive can be helpful, as can the various ordering primitives discussed +in this document. It nevertheless bears repeating that use of unmarked +C-language accesses requires careful attention to not just your code, +but to all the compilers that might be used to build it. Such compilers +might replace a series of loads with a single load, and might replace +a series of stores with a single store. Some compilers will even split +a single store into multiple smaller stores. + +But there are some ways of using unmarked C-language accesses for shared +variables without such worries: + +o Guard all accesses to a given variable by a particular lock, + so that there are never concurrent conflicting accesses to + that variable. (There are "conflicting accesses" when + (1) at least one of the concurrent accesses to a variable is an + unmarked C-language access and (2) when at least one of those + accesses is a write, whether marked or not.) + +o As above, but using other synchronization primitives such + as reader-writer locks or sequence locks. + +o Use locking or other means to ensure that all concurrent accesses + to a given variable are reads. + +o Restrict use of a given variable to statistics or heuristics + where the occasional bogus value can be tolerated. + +o Declare the accessed variables as C11 atomics. + https://lwn.net/Articles/691128/ + +o Declare the accessed variables as "volatile". + +If you need to live more dangerously, please do take the time to +understand the compilers. One place to start is these two LWN +articles: + +Who's afraid of a big bad optimizing compiler? + https://lwn.net/Articles/793253 +Calibrating your fear of big bad optimizing compilers + https://lwn.net/Articles/799218 + +Used properly, unmarked C-language accesses can reduce overhead on +fastpaths. However, the price is great care and continual attention +to your compiler as new versions come out and as new optimizations +are enabled. diff --git a/tools/memory-model/README b/tools/memory-model/README index c8144d4aafa0..39d08d1f0443 100644 --- a/tools/memory-model/README +++ b/tools/memory-model/README @@ -161,26 +161,8 @@ running LKMM litmus tests. DESCRIPTION OF FILES ==================== -Documentation/cheatsheet.txt - Quick-reference guide to the Linux-kernel memory model. - -Documentation/explanation.txt - Describes the memory model in detail. - -Documentation/litmus-tests.txt - Describes the format, features, capabilities, and limitations - of the litmus tests that LKMM can evaluate. - -Documentation/recipes.txt - Lists common memory-ordering patterns. - -Documentation/references.txt - Provides background reading. - -Documentation/simple.txt - Starting point for someone new to Linux-kernel concurrency. - And also for those needing a reminder of the simpler approaches - to concurrency! +Documentation/README + Guide to the other documents in the Documentation/ directory. linux-kernel.bell Categorizes the relevant instructions, including memory diff --git a/tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus b/tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus index 967f9f2a6226..772544f03fb5 100644 --- a/tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus +++ b/tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus @@ -7,7 +7,9 @@ C CoRR+poonceonce+Once * reads from the same variable are ordered. *) -{} +{ + int x; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus b/tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus index 4635739f3974..5faae98f7ffb 100644 --- a/tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus +++ b/tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus @@ -7,7 +7,9 @@ C CoRW+poonceonce+Once * a given variable and a later write to that same variable are ordered. *) -{} +{ + int x; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus b/tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus index bb068c92d8da..77c9cc9f8dc6 100644 --- a/tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus +++ b/tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus @@ -7,7 +7,9 @@ C CoWR+poonceonce+Once * given variable and a later read from that same variable are ordered. *) -{} +{ + int x; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/CoWW+poonceonce.litmus b/tools/memory-model/litmus-tests/CoWW+poonceonce.litmus index 0d9f0a958799..85ef746f511a 100644 --- a/tools/memory-model/litmus-tests/CoWW+poonceonce.litmus +++ b/tools/memory-model/litmus-tests/CoWW+poonceonce.litmus @@ -7,7 +7,9 @@ C CoWW+poonceonce * writes to the same variable are ordered. *) -{} +{ + int x; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/IRIW+fencembonceonces+OnceOnce.litmus b/tools/memory-model/litmus-tests/IRIW+fencembonceonces+OnceOnce.litmus index e729d2776e89..87aa900125ab 100644 --- a/tools/memory-model/litmus-tests/IRIW+fencembonceonces+OnceOnce.litmus +++ b/tools/memory-model/litmus-tests/IRIW+fencembonceonces+OnceOnce.litmus @@ -10,7 +10,10 @@ C IRIW+fencembonceonces+OnceOnce * process? This litmus test exercises LKMM's "propagation" rule. *) -{} +{ + int x; + int y; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus b/tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus index 4b54dd6a6cd9..f84022dca555 100644 --- a/tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus +++ b/tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus @@ -10,7 +10,10 @@ C IRIW+poonceonces+OnceOnce * different process? *) -{} +{ + int x; + int y; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus b/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus index 094d58df7789..398f624daa77 100644 --- a/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus +++ b/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus @@ -7,7 +7,12 @@ C ISA2+pooncelock+pooncelock+pombonce * (in P0() and P1()) is visible to external process P2(). *) -{} +{ + spinlock_t mylock; + int x; + int y; + int z; +} P0(int *x, int *y, spinlock_t *mylock) { diff --git a/tools/memory-model/litmus-tests/ISA2+poonceonces.litmus b/tools/memory-model/litmus-tests/ISA2+poonceonces.litmus index b321aa6f4ea5..212a432ba16b 100644 --- a/tools/memory-model/litmus-tests/ISA2+poonceonces.litmus +++ b/tools/memory-model/litmus-tests/ISA2+poonceonces.litmus @@ -9,7 +9,11 @@ C ISA2+poonceonces * of the smp_load_acquire() invocations are replaced by READ_ONCE()? *) -{} +{ + int x; + int y; + int z; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus b/tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus index 025b0462ec9b..7afd85672ccd 100644 --- a/tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus +++ b/tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus @@ -11,7 +11,11 @@ C ISA2+pooncerelease+poacquirerelease+poacquireonce * (AKA non-rf) link, so release-acquire is all that is needed. *) -{} +{ + int x; + int y; + int z; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/LB+fencembonceonce+ctrlonceonce.litmus b/tools/memory-model/litmus-tests/LB+fencembonceonce+ctrlonceonce.litmus index 4727f5aaf03b..c8a93c7ee556 100644 --- a/tools/memory-model/litmus-tests/LB+fencembonceonce+ctrlonceonce.litmus +++ b/tools/memory-model/litmus-tests/LB+fencembonceonce+ctrlonceonce.litmus @@ -11,7 +11,10 @@ C LB+fencembonceonce+ctrlonceonce * another control dependency and order would still be maintained.) *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus b/tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus index 07b9904b0e49..2fa029568fa1 100644 --- a/tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus +++ b/tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus @@ -8,7 +8,10 @@ C LB+poacquireonce+pooncerelease * to the other? *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/LB+poonceonces.litmus b/tools/memory-model/litmus-tests/LB+poonceonces.litmus index 74c49cb3c37b..2107306e8625 100644 --- a/tools/memory-model/litmus-tests/LB+poonceonces.litmus +++ b/tools/memory-model/litmus-tests/LB+poonceonces.litmus @@ -7,7 +7,10 @@ C LB+poonceonces * be prevented even with no explicit ordering? *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/MP+fencewmbonceonce+fencermbonceonce.litmus b/tools/memory-model/litmus-tests/MP+fencewmbonceonce+fencermbonceonce.litmus index a273da9faa6d..c5c168d92973 100644 --- a/tools/memory-model/litmus-tests/MP+fencewmbonceonce+fencermbonceonce.litmus +++ b/tools/memory-model/litmus-tests/MP+fencewmbonceonce+fencermbonceonce.litmus @@ -8,23 +8,26 @@ C MP+fencewmbonceonce+fencermbonceonce * is usually better to use smp_store_release() and smp_load_acquire(). *) -{} +{ + int buf; + int flag; +} -P0(int *x, int *y) +P0(int *buf, int *flag) // Producer { - WRITE_ONCE(*x, 1); + WRITE_ONCE(*buf, 1); smp_wmb(); - WRITE_ONCE(*y, 1); + WRITE_ONCE(*flag, 1); } -P1(int *x, int *y) +P1(int *buf, int *flag) // Consumer { int r0; int r1; - r0 = READ_ONCE(*y); + r0 = READ_ONCE(*flag); smp_rmb(); - r1 = READ_ONCE(*x); + r1 = READ_ONCE(*buf); } -exists (1:r0=1 /\ 1:r1=0) +exists (1:r0=1 /\ 1:r1=0) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus b/tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus index 97731b4bbdd8..20ff62649f1e 100644 --- a/tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus +++ b/tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus @@ -10,25 +10,26 @@ C MP+onceassign+derefonce *) { -y=z; -z=0; + int *p=y; + int x; + int y=0; } -P0(int *x, int **y) +P0(int *x, int **p) // Producer { WRITE_ONCE(*x, 1); - rcu_assign_pointer(*y, x); + rcu_assign_pointer(*p, x); } -P1(int *x, int **y) +P1(int *x, int **p) // Consumer { int *r0; int r1; rcu_read_lock(); - r0 = rcu_dereference(*y); + r0 = rcu_dereference(*p); r1 = READ_ONCE(*r0); rcu_read_unlock(); } -exists (1:r0=x /\ 1:r1=0) +exists (1:r0=x /\ 1:r1=0) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus b/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus index 50f4d62bbf0e..153917ad5dc9 100644 --- a/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus +++ b/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus @@ -11,9 +11,11 @@ C MP+polockmbonce+poacquiresilsil *) { + spinlock_t lo; + int x; } -P0(spinlock_t *lo, int *x) +P0(spinlock_t *lo, int *x) // Producer { spin_lock(lo); smp_mb__after_spinlock(); @@ -21,7 +23,7 @@ P0(spinlock_t *lo, int *x) spin_unlock(lo); } -P1(spinlock_t *lo, int *x) +P1(spinlock_t *lo, int *x) // Consumer { int r1; int r2; @@ -32,4 +34,4 @@ P1(spinlock_t *lo, int *x) r3 = spin_is_locked(lo); } -exists (1:r1=1 /\ 1:r2=0 /\ 1:r3=1) +exists (1:r1=1 /\ 1:r2=0 /\ 1:r3=1) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus b/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus index abf81e7a0895..aad64397bb8c 100644 --- a/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus +++ b/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus @@ -11,16 +11,18 @@ C MP+polockonce+poacquiresilsil *) { + spinlock_t lo; + int x; } -P0(spinlock_t *lo, int *x) +P0(spinlock_t *lo, int *x) // Producer { spin_lock(lo); WRITE_ONCE(*x, 1); spin_unlock(lo); } -P1(spinlock_t *lo, int *x) +P1(spinlock_t *lo, int *x) // Consumer { int r1; int r2; @@ -31,4 +33,4 @@ P1(spinlock_t *lo, int *x) r3 = spin_is_locked(lo); } -exists (1:r1=1 /\ 1:r2=0 /\ 1:r3=1) +exists (1:r1=1 /\ 1:r2=0 /\ 1:r3=1) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+polocks.litmus b/tools/memory-model/litmus-tests/MP+polocks.litmus index 712a4fcdf6ce..21cbca6f3be4 100644 --- a/tools/memory-model/litmus-tests/MP+polocks.litmus +++ b/tools/memory-model/litmus-tests/MP+polocks.litmus @@ -11,25 +11,29 @@ C MP+polocks * to see all prior accesses by those other CPUs. *) -{} +{ + spinlock_t mylock; + int buf; + int flag; +} -P0(int *x, int *y, spinlock_t *mylock) +P0(int *buf, int *flag, spinlock_t *mylock) // Producer { - WRITE_ONCE(*x, 1); + WRITE_ONCE(*buf, 1); spin_lock(mylock); - WRITE_ONCE(*y, 1); + WRITE_ONCE(*flag, 1); spin_unlock(mylock); } -P1(int *x, int *y, spinlock_t *mylock) +P1(int *buf, int *flag, spinlock_t *mylock) // Consumer { int r0; int r1; spin_lock(mylock); - r0 = READ_ONCE(*y); + r0 = READ_ONCE(*flag); spin_unlock(mylock); - r1 = READ_ONCE(*x); + r1 = READ_ONCE(*buf); } -exists (1:r0=1 /\ 1:r1=0) +exists (1:r0=1 /\ 1:r1=0) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+poonceonces.litmus b/tools/memory-model/litmus-tests/MP+poonceonces.litmus index 172f0145301c..9f9769d647c7 100644 --- a/tools/memory-model/litmus-tests/MP+poonceonces.litmus +++ b/tools/memory-model/litmus-tests/MP+poonceonces.litmus @@ -7,21 +7,24 @@ C MP+poonceonces * no ordering at all? *) -{} +{ + int buf; + int flag; +} -P0(int *x, int *y) +P0(int *buf, int *flag) // Producer { - WRITE_ONCE(*x, 1); - WRITE_ONCE(*y, 1); + WRITE_ONCE(*buf, 1); + WRITE_ONCE(*flag, 1); } -P1(int *x, int *y) +P1(int *buf, int *flag) // Consumer { int r0; int r1; - r0 = READ_ONCE(*y); - r1 = READ_ONCE(*x); + r0 = READ_ONCE(*flag); + r1 = READ_ONCE(*buf); } -exists (1:r0=1 /\ 1:r1=0) +exists (1:r0=1 /\ 1:r1=0) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus b/tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus index d52c68429722..cbe28e733443 100644 --- a/tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus +++ b/tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus @@ -8,21 +8,24 @@ C MP+pooncerelease+poacquireonce * pattern. *) -{} +{ + int buf; + int flag; +} -P0(int *x, int *y) +P0(int *buf, int *flag) // Producer { - WRITE_ONCE(*x, 1); - smp_store_release(y, 1); + WRITE_ONCE(*buf, 1); + smp_store_release(flag, 1); } -P1(int *x, int *y) +P1(int *buf, int *flag) // Consumer { int r0; int r1; - r0 = smp_load_acquire(y); - r1 = READ_ONCE(*x); + r0 = smp_load_acquire(flag); + r1 = READ_ONCE(*buf); } -exists (1:r0=1 /\ 1:r1=0) +exists (1:r0=1 /\ 1:r1=0) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+porevlocks.litmus b/tools/memory-model/litmus-tests/MP+porevlocks.litmus index 72c9276b363e..012041bd4feb 100644 --- a/tools/memory-model/litmus-tests/MP+porevlocks.litmus +++ b/tools/memory-model/litmus-tests/MP+porevlocks.litmus @@ -11,25 +11,29 @@ C MP+porevlocks * see all prior accesses by those other CPUs. *) -{} +{ + spinlock_t mylock; + int buf; + int flag; +} -P0(int *x, int *y, spinlock_t *mylock) +P0(int *buf, int *flag, spinlock_t *mylock) // Consumer { int r0; int r1; - r0 = READ_ONCE(*y); + r0 = READ_ONCE(*flag); spin_lock(mylock); - r1 = READ_ONCE(*x); + r1 = READ_ONCE(*buf); spin_unlock(mylock); } -P1(int *x, int *y, spinlock_t *mylock) +P1(int *buf, int *flag, spinlock_t *mylock) // Producer { spin_lock(mylock); - WRITE_ONCE(*x, 1); + WRITE_ONCE(*buf, 1); spin_unlock(mylock); - WRITE_ONCE(*y, 1); + WRITE_ONCE(*flag, 1); } -exists (0:r0=1 /\ 0:r1=0) +exists (0:r0=1 /\ 0:r1=0) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/R+fencembonceonces.litmus b/tools/memory-model/litmus-tests/R+fencembonceonces.litmus index 222a0b850b4a..af9463b39b4a 100644 --- a/tools/memory-model/litmus-tests/R+fencembonceonces.litmus +++ b/tools/memory-model/litmus-tests/R+fencembonceonces.litmus @@ -9,7 +9,10 @@ C R+fencembonceonces * cause the resulting test to be allowed. *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/R+poonceonces.litmus b/tools/memory-model/litmus-tests/R+poonceonces.litmus index 5386f128a131..bcd5574e304a 100644 --- a/tools/memory-model/litmus-tests/R+poonceonces.litmus +++ b/tools/memory-model/litmus-tests/R+poonceonces.litmus @@ -8,7 +8,10 @@ C R+poonceonces * store propagation delays. *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/S+fencewmbonceonce+poacquireonce.litmus b/tools/memory-model/litmus-tests/S+fencewmbonceonce+poacquireonce.litmus index 18479823cd6c..c36341d1aed6 100644 --- a/tools/memory-model/litmus-tests/S+fencewmbonceonce+poacquireonce.litmus +++ b/tools/memory-model/litmus-tests/S+fencewmbonceonce+poacquireonce.litmus @@ -7,7 +7,10 @@ C S+fencewmbonceonce+poacquireonce * store against a subsequent store? *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/S+poonceonces.litmus b/tools/memory-model/litmus-tests/S+poonceonces.litmus index 8c9c2f81a580..7775c23143a0 100644 --- a/tools/memory-model/litmus-tests/S+poonceonces.litmus +++ b/tools/memory-model/litmus-tests/S+poonceonces.litmus @@ -9,7 +9,10 @@ C S+poonceonces * READ_ONCE(), is ordering preserved? *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/SB+fencembonceonces.litmus b/tools/memory-model/litmus-tests/SB+fencembonceonces.litmus index ed5fff18d223..833cdfeb7c09 100644 --- a/tools/memory-model/litmus-tests/SB+fencembonceonces.litmus +++ b/tools/memory-model/litmus-tests/SB+fencembonceonces.litmus @@ -9,7 +9,10 @@ C SB+fencembonceonces * suffice, but not much else.) *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/SB+poonceonces.litmus b/tools/memory-model/litmus-tests/SB+poonceonces.litmus index 10d550730b25..c92211ecbfdf 100644 --- a/tools/memory-model/litmus-tests/SB+poonceonces.litmus +++ b/tools/memory-model/litmus-tests/SB+poonceonces.litmus @@ -8,7 +8,10 @@ C SB+poonceonces * variable that the preceding process reads. *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/SB+rfionceonce-poonceonces.litmus b/tools/memory-model/litmus-tests/SB+rfionceonce-poonceonces.litmus index 04a16603660b..84344b455eb7 100644 --- a/tools/memory-model/litmus-tests/SB+rfionceonce-poonceonces.litmus +++ b/tools/memory-model/litmus-tests/SB+rfionceonce-poonceonces.litmus @@ -6,7 +6,10 @@ C SB+rfionceonce-poonceonces * This litmus test demonstrates that LKMM is not fully multicopy atomic. *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus b/tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus index 6a2bc12a1af1..431494708611 100644 --- a/tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus +++ b/tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus @@ -8,7 +8,10 @@ C WRC+poonceonces+Once * test has no ordering at all. *) -{} +{ + int x; + int y; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/WRC+pooncerelease+fencermbonceonce+Once.litmus b/tools/memory-model/litmus-tests/WRC+pooncerelease+fencermbonceonce+Once.litmus index e9947250d7de..554999c64db5 100644 --- a/tools/memory-model/litmus-tests/WRC+pooncerelease+fencermbonceonce+Once.litmus +++ b/tools/memory-model/litmus-tests/WRC+pooncerelease+fencermbonceonce+Once.litmus @@ -10,7 +10,10 @@ C WRC+pooncerelease+fencermbonceonce+Once * is A-cumulative in LKMM. *) -{} +{ + int x; + int y; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/Z6.0+pooncelock+poonceLock+pombonce.litmus b/tools/memory-model/litmus-tests/Z6.0+pooncelock+poonceLock+pombonce.litmus index 415248fb6699..265a95ffef13 100644 --- a/tools/memory-model/litmus-tests/Z6.0+pooncelock+poonceLock+pombonce.litmus +++ b/tools/memory-model/litmus-tests/Z6.0+pooncelock+poonceLock+pombonce.litmus @@ -9,7 +9,12 @@ C Z6.0+pooncelock+poonceLock+pombonce * by CPUs not holding that lock. *) -{} +{ + spinlock_t mylock; + int x; + int y; + int z; +} P0(int *x, int *y, spinlock_t *mylock) { diff --git a/tools/memory-model/litmus-tests/Z6.0+pooncelock+pooncelock+pombonce.litmus b/tools/memory-model/litmus-tests/Z6.0+pooncelock+pooncelock+pombonce.litmus index 10a2aa04cd07..0c9aea8e80df 100644 --- a/tools/memory-model/litmus-tests/Z6.0+pooncelock+pooncelock+pombonce.litmus +++ b/tools/memory-model/litmus-tests/Z6.0+pooncelock+pooncelock+pombonce.litmus @@ -8,7 +8,12 @@ C Z6.0+pooncelock+pooncelock+pombonce * seen as ordered by a third process not holding that lock. *) -{} +{ + spinlock_t mylock; + int x; + int y; + int z; +} P0(int *x, int *y, spinlock_t *mylock) { diff --git a/tools/memory-model/litmus-tests/Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus b/tools/memory-model/litmus-tests/Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus index 88e70b87a683..661f9aaa5791 100644 --- a/tools/memory-model/litmus-tests/Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus +++ b/tools/memory-model/litmus-tests/Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus @@ -14,7 +14,11 @@ C Z6.0+pooncerelease+poacquirerelease+fencembonceonce * involving locking.) *) -{} +{ + int x; + int y; + int z; +} P0(int *x, int *y) { diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 4ea9a833dde7..5cdb19036d7f 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -3,15 +3,6 @@ include ../scripts/Makefile.include include ../scripts/Makefile.arch # always use the host compiler -ifneq ($(LLVM),) -HOSTAR ?= llvm-ar -HOSTCC ?= clang -HOSTLD ?= ld.lld -else -HOSTAR ?= ar -HOSTCC ?= gcc -HOSTLD ?= ld -endif AR = $(HOSTAR) CC = $(HOSTCC) LD = $(HOSTLD) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 7ce3f2e8b9c7..62f3deb1d3a8 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -175,10 +175,6 @@ endef LD += $(EXTRA_LDFLAGS) -HOSTCC ?= gcc -HOSTLD ?= ld -HOSTAR ?= ar - PKG_CONFIG = $(CROSS_COMPILE)pkg-config LLVM_CONFIG ?= llvm-config diff --git a/tools/power/acpi/Makefile.config b/tools/power/acpi/Makefile.config index 54a2857c2510..331f6d30f472 100644 --- a/tools/power/acpi/Makefile.config +++ b/tools/power/acpi/Makefile.config @@ -54,7 +54,6 @@ INSTALL_SCRIPT = ${INSTALL_PROGRAM} CROSS = #/usr/i386-linux-uclibc/usr/bin/i386-uclibc- CROSS_COMPILE ?= $(CROSS) LD = $(CC) -HOSTCC = gcc # check if compiler option is supported cc-supports = ${shell if $(CC) ${1} -S -o /dev/null -x c /dev/null > /dev/null 2>&1; then echo "$(1)"; fi;} diff --git a/tools/power/cpupower/lib/cpupower.c b/tools/power/cpupower/lib/cpupower.c index 3656e697537e..3f7d0c0c5067 100644 --- a/tools/power/cpupower/lib/cpupower.c +++ b/tools/power/cpupower/lib/cpupower.c @@ -16,8 +16,8 @@ unsigned int cpupower_read_sysfs(const char *path, char *buf, size_t buflen) { - int fd; ssize_t numread; + int fd; fd = open(path, O_RDONLY); if (fd == -1) @@ -35,6 +35,27 @@ unsigned int cpupower_read_sysfs(const char *path, char *buf, size_t buflen) return (unsigned int) numread; } +unsigned int cpupower_write_sysfs(const char *path, char *buf, size_t buflen) +{ + ssize_t numwritten; + int fd; + + fd = open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + if (numwritten < 1) { + perror(path); + close(fd); + return -1; + } + + close(fd); + + return (unsigned int) numwritten; +} + /* * Detect whether a CPU is online * diff --git a/tools/power/cpupower/lib/cpupower_intern.h b/tools/power/cpupower/lib/cpupower_intern.h index 4887c76d23f8..ac1112b956ec 100644 --- a/tools/power/cpupower/lib/cpupower_intern.h +++ b/tools/power/cpupower/lib/cpupower_intern.h @@ -1,6 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ #define PATH_TO_CPU "/sys/devices/system/cpu/" + +#ifndef MAX_LINE_LEN #define MAX_LINE_LEN 4096 +#endif + #define SYSFS_PATH_MAX 255 unsigned int cpupower_read_sysfs(const char *path, char *buf, size_t buflen); +unsigned int cpupower_write_sysfs(const char *path, char *buf, size_t buflen); diff --git a/tools/power/cpupower/utils/cpufreq-set.c b/tools/power/cpupower/utils/cpufreq-set.c index 7b2164e07057..c5e60a39cfa6 100644 --- a/tools/power/cpupower/utils/cpufreq-set.c +++ b/tools/power/cpupower/utils/cpufreq-set.c @@ -315,6 +315,7 @@ int cmd_freq_set(int argc, char **argv) } } + get_cpustate(); /* loop over CPUs */ for (cpu = bitmask_first(cpus_chosen); @@ -332,5 +333,7 @@ int cmd_freq_set(int argc, char **argv) } } + print_offline_cpus(); + return 0; } diff --git a/tools/power/cpupower/utils/cpuidle-set.c b/tools/power/cpupower/utils/cpuidle-set.c index 569f268f4c7f..46158928f9ad 100644 --- a/tools/power/cpupower/utils/cpuidle-set.c +++ b/tools/power/cpupower/utils/cpuidle-set.c @@ -95,6 +95,8 @@ int cmd_idle_set(int argc, char **argv) exit(EXIT_FAILURE); } + get_cpustate(); + /* Default is: set all CPUs */ if (bitmask_isallclear(cpus_chosen)) bitmask_setall(cpus_chosen); @@ -181,5 +183,7 @@ int cmd_idle_set(int argc, char **argv) break; } } + + print_offline_cpus(); return EXIT_SUCCESS; } diff --git a/tools/power/cpupower/utils/cpupower-info.c b/tools/power/cpupower/utils/cpupower-info.c index 0ba61a2c4d81..06345b543786 100644 --- a/tools/power/cpupower/utils/cpupower-info.c +++ b/tools/power/cpupower/utils/cpupower-info.c @@ -101,7 +101,7 @@ int cmd_info(int argc, char **argv) } if (params.perf_bias) { - ret = msr_intel_get_perf_bias(cpu); + ret = cpupower_intel_get_perf_bias(cpu); if (ret < 0) { fprintf(stderr, _("Could not read perf-bias value[%d]\n"), ret); diff --git a/tools/power/cpupower/utils/cpupower-set.c b/tools/power/cpupower/utils/cpupower-set.c index 052044d7e012..180d5ba877e6 100644 --- a/tools/power/cpupower/utils/cpupower-set.c +++ b/tools/power/cpupower/utils/cpupower-set.c @@ -95,7 +95,7 @@ int cmd_set(int argc, char **argv) } if (params.perf_bias) { - ret = msr_intel_set_perf_bias(cpu, perf_bias); + ret = cpupower_intel_set_perf_bias(cpu, perf_bias); if (ret) { fprintf(stderr, _("Error setting perf-bias " "value on CPU %d\n"), cpu); diff --git a/tools/power/cpupower/utils/cpupower.c b/tools/power/cpupower/utils/cpupower.c index 8e3d08042825..8ac3304a9957 100644 --- a/tools/power/cpupower/utils/cpupower.c +++ b/tools/power/cpupower/utils/cpupower.c @@ -34,6 +34,8 @@ int run_as_root; int base_cpu; /* Affected cpus chosen by -c/--cpu param */ struct bitmask *cpus_chosen; +struct bitmask *online_cpus; +struct bitmask *offline_cpus; #ifdef DEBUG int be_verbose; @@ -178,6 +180,8 @@ int main(int argc, const char *argv[]) char pathname[32]; cpus_chosen = bitmask_alloc(sysconf(_SC_NPROCESSORS_CONF)); + online_cpus = bitmask_alloc(sysconf(_SC_NPROCESSORS_CONF)); + offline_cpus = bitmask_alloc(sysconf(_SC_NPROCESSORS_CONF)); argc--; argv += 1; @@ -230,6 +234,10 @@ int main(int argc, const char *argv[]) ret = p->main(argc, argv); if (cpus_chosen) bitmask_free(cpus_chosen); + if (online_cpus) + bitmask_free(online_cpus); + if (offline_cpus) + bitmask_free(offline_cpus); return ret; } print_help(); diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index c258eeccd05f..0642e60a6ce1 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -94,6 +94,8 @@ struct cpupower_cpu_info { */ extern int get_cpu_info(struct cpupower_cpu_info *cpu_info); extern struct cpupower_cpu_info cpupower_cpu_info; + + /* cpuid and cpuinfo helpers **************************/ /* X86 ONLY ****************************************/ @@ -105,8 +107,8 @@ extern struct cpupower_cpu_info cpupower_cpu_info; extern int read_msr(int cpu, unsigned int idx, unsigned long long *val); extern int write_msr(int cpu, unsigned int idx, unsigned long long val); -extern int msr_intel_set_perf_bias(unsigned int cpu, unsigned int val); -extern int msr_intel_get_perf_bias(unsigned int cpu); +extern int cpupower_intel_set_perf_bias(unsigned int cpu, unsigned int val); +extern int cpupower_intel_get_perf_bias(unsigned int cpu); extern unsigned long long msr_intel_get_turbo_ratio(unsigned int cpu); /* Read/Write msr ****************************/ @@ -150,9 +152,9 @@ static inline int read_msr(int cpu, unsigned int idx, unsigned long long *val) { return -1; }; static inline int write_msr(int cpu, unsigned int idx, unsigned long long val) { return -1; }; -static inline int msr_intel_set_perf_bias(unsigned int cpu, unsigned int val) +static inline int cpupower_intel_set_perf_bias(unsigned int cpu, unsigned int val) { return -1; }; -static inline int msr_intel_get_perf_bias(unsigned int cpu) +static inline int cpupower_intel_get_perf_bias(unsigned int cpu) { return -1; }; static inline unsigned long long msr_intel_get_turbo_ratio(unsigned int cpu) { return 0; }; @@ -171,4 +173,14 @@ static inline unsigned int cpuid_ecx(unsigned int op) { return 0; }; static inline unsigned int cpuid_edx(unsigned int op) { return 0; }; #endif /* defined(__i386__) || defined(__x86_64__) */ +/* + * CPU State related functions + */ +extern struct bitmask *online_cpus; +extern struct bitmask *offline_cpus; + +void get_cpustate(void); +void print_online_cpus(void); +void print_offline_cpus(void); + #endif /* __CPUPOWERUTILS_HELPERS__ */ diff --git a/tools/power/cpupower/utils/helpers/misc.c b/tools/power/cpupower/utils/helpers/misc.c index f406adc40bad..650b9a9a6584 100644 --- a/tools/power/cpupower/utils/helpers/misc.c +++ b/tools/power/cpupower/utils/helpers/misc.c @@ -1,7 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 -#if defined(__i386__) || defined(__x86_64__) + +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> #include "helpers/helpers.h" +#include "helpers/sysfs.h" + +#if defined(__i386__) || defined(__x86_64__) + +#include "cpupower_intern.h" #define MSR_AMD_HWCR 0xc0010015 @@ -40,4 +48,104 @@ int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active, *support = *active = 1; return 0; } + +int cpupower_intel_get_perf_bias(unsigned int cpu) +{ + char linebuf[MAX_LINE_LEN]; + char path[SYSFS_PATH_MAX]; + unsigned long val; + char *endp; + + if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_PERF_BIAS)) + return -1; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpu%u/power/energy_perf_bias", cpu); + + if (cpupower_read_sysfs(path, linebuf, MAX_LINE_LEN) == 0) + return -1; + + val = strtol(linebuf, &endp, 0); + if (endp == linebuf || errno == ERANGE) + return -1; + + return val; +} + +int cpupower_intel_set_perf_bias(unsigned int cpu, unsigned int val) +{ + char path[SYSFS_PATH_MAX]; + char linebuf[3] = {}; + + if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_PERF_BIAS)) + return -1; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpu%u/power/energy_perf_bias", cpu); + snprintf(linebuf, sizeof(linebuf), "%d", val); + + if (cpupower_write_sysfs(path, linebuf, 3) <= 0) + return -1; + + return 0; +} + #endif /* #if defined(__i386__) || defined(__x86_64__) */ + +/* get_cpustate + * + * Gather the information of all online CPUs into bitmask struct + */ +void get_cpustate(void) +{ + unsigned int cpu = 0; + + bitmask_clearall(online_cpus); + bitmask_clearall(offline_cpus); + + for (cpu = bitmask_first(cpus_chosen); + cpu <= bitmask_last(cpus_chosen); cpu++) { + + if (cpupower_is_cpu_online(cpu) == 1) + bitmask_setbit(online_cpus, cpu); + else + bitmask_setbit(offline_cpus, cpu); + + continue; + } +} + +/* print_online_cpus + * + * Print the CPU numbers of all CPUs that are online currently + */ +void print_online_cpus(void) +{ + int str_len = 0; + char *online_cpus_str = NULL; + + str_len = online_cpus->size * 5; + online_cpus_str = (void *)malloc(sizeof(char) * str_len); + + if (!bitmask_isallclear(online_cpus)) { + bitmask_displaylist(online_cpus_str, str_len, online_cpus); + printf(_("Following CPUs are online:\n%s\n"), online_cpus_str); + } +} + +/* print_offline_cpus + * + * Print the CPU numbers of all CPUs that are offline currently + */ +void print_offline_cpus(void) +{ + int str_len = 0; + char *offline_cpus_str = NULL; + + str_len = offline_cpus->size * 5; + offline_cpus_str = (void *)malloc(sizeof(char) * str_len); + + if (!bitmask_isallclear(offline_cpus)) { + bitmask_displaylist(offline_cpus_str, str_len, offline_cpus); + printf(_("Following CPUs are offline:\n%s\n"), offline_cpus_str); + printf(_("cpupower set operation was not performed on them\n")); + } +} diff --git a/tools/power/cpupower/utils/helpers/msr.c b/tools/power/cpupower/utils/helpers/msr.c index ab9950748838..8b0b6be74bb8 100644 --- a/tools/power/cpupower/utils/helpers/msr.c +++ b/tools/power/cpupower/utils/helpers/msr.c @@ -11,7 +11,6 @@ /* Intel specific MSRs */ #define MSR_IA32_PERF_STATUS 0x198 #define MSR_IA32_MISC_ENABLES 0x1a0 -#define MSR_IA32_ENERGY_PERF_BIAS 0x1b0 #define MSR_NEHALEM_TURBO_RATIO_LIMIT 0x1ad /* @@ -73,33 +72,6 @@ int write_msr(int cpu, unsigned int idx, unsigned long long val) return -1; } -int msr_intel_get_perf_bias(unsigned int cpu) -{ - unsigned long long val; - int ret; - - if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_PERF_BIAS)) - return -1; - - ret = read_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &val); - if (ret) - return ret; - return val; -} - -int msr_intel_set_perf_bias(unsigned int cpu, unsigned int val) -{ - int ret; - - if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_PERF_BIAS)) - return -1; - - ret = write_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, val); - if (ret) - return ret; - return 0; -} - unsigned long long msr_intel_get_turbo_ratio(unsigned int cpu) { unsigned long long val; diff --git a/tools/power/pm-graph/README b/tools/power/pm-graph/README index 89d0a7dab4bc..da468bd510ca 100644 --- a/tools/power/pm-graph/README +++ b/tools/power/pm-graph/README @@ -6,7 +6,7 @@ |_| |___/ |_| pm-graph: suspend/resume/boot timing analysis tools - Version: 5.7 + Version: 5.8 Author: Todd Brandt <todd.e.brandt@intel.com> Home Page: https://01.org/pm-graph @@ -61,7 +61,7 @@ - runs with python2 or python3, choice is made by /usr/bin/python link - python - python-configparser (for python2 sleepgraph) - - python-requests (for googlesheet.py) + - python-requests (for stresstester.py) - linux-tools-common (for turbostat usage in sleepgraph) Ubuntu: diff --git a/tools/power/pm-graph/sleepgraph.py b/tools/power/pm-graph/sleepgraph.py index 1bc36a1db14f..81f4b8abbdf7 100755 --- a/tools/power/pm-graph/sleepgraph.py +++ b/tools/power/pm-graph/sleepgraph.py @@ -81,7 +81,7 @@ def ascii(text): # store system values and test parameters class SystemValues: title = 'SleepGraph' - version = '5.7' + version = '5.8' ansi = False rs = 0 display = '' @@ -92,8 +92,9 @@ class SystemValues: testlog = True dmesglog = True ftracelog = False + acpidebug = True tstat = True - mindevlen = 0.0 + mindevlen = 0.0001 mincglen = 0.0 cgphase = '' cgtest = -1 @@ -115,6 +116,7 @@ class SystemValues: fpdtpath = '/sys/firmware/acpi/tables/FPDT' epath = '/sys/kernel/debug/tracing/events/power/' pmdpath = '/sys/power/pm_debug_messages' + acpipath='/sys/module/acpi/parameters/debug_level' traceevents = [ 'suspend_resume', 'wakeup_source_activate', @@ -162,16 +164,16 @@ class SystemValues: devdump = False mixedphaseheight = True devprops = dict() + cfgdef = dict() platinfo = [] predelay = 0 postdelay = 0 - pmdebug = '' tmstart = 'SUSPEND START %Y%m%d-%H:%M:%S.%f' tmend = 'RESUME COMPLETE %Y%m%d-%H:%M:%S.%f' tracefuncs = { 'sys_sync': {}, 'ksys_sync': {}, - 'pm_notifier_call_chain_robust': {}, + '__pm_notifier_call_chain': {}, 'pm_prepare_console': {}, 'pm_notifier_call_chain': {}, 'freeze_processes': {}, @@ -490,9 +492,9 @@ class SystemValues: call('echo 0 > %s/wakealarm' % self.rtcpath, shell=True) def initdmesg(self): # get the latest time stamp from the dmesg log - fp = Popen('dmesg', stdout=PIPE).stdout + lines = Popen('dmesg', stdout=PIPE).stdout.readlines() ktime = '0' - for line in fp: + for line in reversed(lines): line = ascii(line).replace('\r\n', '') idx = line.find('[') if idx > 1: @@ -500,7 +502,7 @@ class SystemValues: m = re.match('[ \t]*(\[ *)(?P<ktime>[0-9\.]*)(\]) (?P<msg>.*)', line) if(m): ktime = m.group('ktime') - fp.close() + break self.dmesgstart = float(ktime) def getdmesg(self, testdata): op = self.writeDatafileHeader(self.dmesgfile, testdata) @@ -715,8 +717,6 @@ class SystemValues: self.fsetVal('0', 'events/kprobes/enable') self.fsetVal('', 'kprobe_events') self.fsetVal('1024', 'buffer_size_kb') - if self.pmdebug: - self.setVal(self.pmdebug, self.pmdpath) def setupAllKprobes(self): for name in self.tracefuncs: self.defaultKprobe(name, self.tracefuncs[name]) @@ -740,11 +740,7 @@ class SystemValues: # turn trace off self.fsetVal('0', 'tracing_on') self.cleanupFtrace() - # pm debug messages - pv = self.getVal(self.pmdpath) - if pv != '1': - self.setVal('1', self.pmdpath) - self.pmdebug = pv + self.testVal(self.pmdpath, 'basic', '1') # set the trace clock to global self.fsetVal('global', 'trace_clock') self.fsetVal('nop', 'current_tracer') @@ -900,6 +896,14 @@ class SystemValues: if isgz: return gzip.open(filename, mode+'t') return open(filename, mode) + def putlog(self, filename, text): + with self.openlog(filename, 'a') as fp: + fp.write(text) + fp.close() + def dlog(self, text): + self.putlog(self.dmesgfile, '# %s\n' % text) + def flog(self, text): + self.putlog(self.ftracefile, text) def b64unzip(self, data): try: out = codecs.decode(base64.b64decode(data), 'zlib').decode() @@ -992,9 +996,7 @@ class SystemValues: # add a line for each of these commands with their outputs for name, cmdline, info in cmdafter: footer += '# platform-%s: %s | %s\n' % (name, cmdline, self.b64zip(info)) - - with self.openlog(self.ftracefile, 'a') as fp: - fp.write(footer) + self.flog(footer) return True def commonPrefix(self, list): if len(list) < 2: @@ -1034,6 +1036,7 @@ class SystemValues: cmdline, cmdpath = ' '.join(cargs[2:]), self.getExec(cargs[2]) if not cmdpath or (begin and not delta): continue + self.dlog('[%s]' % cmdline) try: fp = Popen([cmdpath]+cargs[3:], stdout=PIPE, stderr=PIPE).stdout info = ascii(fp.read()).strip() @@ -1060,6 +1063,29 @@ class SystemValues: else: out.append((name, cmdline, '\tnothing' if not info else info)) return out + def testVal(self, file, fmt='basic', value=''): + if file == 'restoreall': + for f in self.cfgdef: + if os.path.exists(f): + fp = open(f, 'w') + fp.write(self.cfgdef[f]) + fp.close() + self.cfgdef = dict() + elif value and os.path.exists(file): + fp = open(file, 'r+') + if fmt == 'radio': + m = re.match('.*\[(?P<v>.*)\].*', fp.read()) + if m: + self.cfgdef[file] = m.group('v') + elif fmt == 'acpi': + line = fp.read().strip().split('\n')[-1] + m = re.match('.* (?P<v>[0-9A-Fx]*) .*', line) + if m: + self.cfgdef[file] = m.group('v') + else: + self.cfgdef[file] = fp.read().strip() + fp.write(value) + fp.close() def haveTurbostat(self): if not self.tstat: return False @@ -1201,6 +1227,57 @@ class SystemValues: self.multitest[sz] *= 1440 elif unit == 'h': self.multitest[sz] *= 60 + def displayControl(self, cmd): + xset, ret = 'timeout 10 xset -d :0.0 {0}', 0 + if self.sudouser: + xset = 'sudo -u %s %s' % (self.sudouser, xset) + if cmd == 'init': + ret = call(xset.format('dpms 0 0 0'), shell=True) + if not ret: + ret = call(xset.format('s off'), shell=True) + elif cmd == 'reset': + ret = call(xset.format('s reset'), shell=True) + elif cmd in ['on', 'off', 'standby', 'suspend']: + b4 = self.displayControl('stat') + ret = call(xset.format('dpms force %s' % cmd), shell=True) + if not ret: + curr = self.displayControl('stat') + self.vprint('Display Switched: %s -> %s' % (b4, curr)) + if curr != cmd: + self.vprint('WARNING: Display failed to change to %s' % cmd) + if ret: + self.vprint('WARNING: Display failed to change to %s with xset' % cmd) + return ret + elif cmd == 'stat': + fp = Popen(xset.format('q').split(' '), stdout=PIPE).stdout + ret = 'unknown' + for line in fp: + m = re.match('[\s]*Monitor is (?P<m>.*)', ascii(line)) + if(m and len(m.group('m')) >= 2): + out = m.group('m').lower() + ret = out[3:] if out[0:2] == 'in' else out + break + fp.close() + return ret + def setRuntimeSuspend(self, before=True): + if before: + # runtime suspend disable or enable + if self.rs > 0: + self.rstgt, self.rsval, self.rsdir = 'on', 'auto', 'enabled' + else: + self.rstgt, self.rsval, self.rsdir = 'auto', 'on', 'disabled' + pprint('CONFIGURING RUNTIME SUSPEND...') + self.rslist = deviceInfo(self.rstgt) + for i in self.rslist: + self.setVal(self.rsval, i) + pprint('runtime suspend %s on all devices (%d changed)' % (self.rsdir, len(self.rslist))) + pprint('waiting 5 seconds...') + time.sleep(5) + else: + # runtime suspend re-enable or re-disable + for i in self.rslist: + self.setVal(self.rstgt, i) + pprint('runtime suspend settings restored on %d devices' % len(self.rslist)) sysvals = SystemValues() switchvalues = ['enable', 'disable', 'on', 'off', 'true', 'false', '1', '0'] @@ -1640,15 +1717,20 @@ class Data: if 'resume_machine' in phase and 'suspend_machine' in lp: tS, tR = self.dmesg[lp]['end'], self.dmesg[phase]['start'] tL = tR - tS - if tL > 0: - left = True if tR > tZero else False - self.trimTime(tS, tL, left) - if 'trying' in self.dmesg[lp] and self.dmesg[lp]['trying'] >= 0.001: - tTry = round(self.dmesg[lp]['trying'] * 1000) - text = '%.0f (-%.0f waking)' % (tL * 1000, tTry) + if tL <= 0: + continue + left = True if tR > tZero else False + self.trimTime(tS, tL, left) + if 'waking' in self.dmesg[lp]: + tCnt = self.dmesg[lp]['waking'][0] + if self.dmesg[lp]['waking'][1] >= 0.001: + tTry = '-%.0f' % (round(self.dmesg[lp]['waking'][1] * 1000)) else: - text = '%.0f' % (tL * 1000) - self.tLow.append(text) + tTry = '-%.3f' % (self.dmesg[lp]['waking'][1] * 1000) + text = '%.0f (%s ms waking %d times)' % (tL * 1000, tTry, tCnt) + else: + text = '%.0f' % (tL * 1000) + self.tLow.append(text) lp = phase def getMemTime(self): if not self.hwstart or not self.hwend: @@ -1921,7 +2003,7 @@ class Data: for dev in list: length = (list[dev]['end'] - list[dev]['start']) * 1000 width = widfmt % (((list[dev]['end']-list[dev]['start'])*100)/tTotal) - if width != '0.000000' and length >= mindevlen: + if length >= mindevlen: devlist.append(dev) self.tdevlist[phase] = devlist def addHorizontalDivider(self, devname, devend): @@ -3316,9 +3398,10 @@ def parseTraceLog(live=False): # trim out s2idle loops, track time trying to freeze llp = data.lastPhase(2) if llp.startswith('suspend_machine'): - if 'trying' not in data.dmesg[llp]: - data.dmesg[llp]['trying'] = 0 - data.dmesg[llp]['trying'] += \ + if 'waking' not in data.dmesg[llp]: + data.dmesg[llp]['waking'] = [0, 0.0] + data.dmesg[llp]['waking'][0] += 1 + data.dmesg[llp]['waking'][1] += \ t.time - data.dmesg[lp]['start'] data.currphase = '' del data.dmesg[lp] @@ -4555,7 +4638,7 @@ def createHTML(testruns, testfail): # draw the devices for this phase phaselist = data.dmesg[b]['list'] for d in sorted(data.tdevlist[b]): - dname = d if '[' not in d else d.split('[')[0] + dname = d if ('[' not in d or 'CPU' in d) else d.split('[')[0] name, dev = dname, phaselist[d] drv = xtraclass = xtrainfo = xtrastyle = '' if 'htmlclass' in dev: @@ -5194,156 +5277,146 @@ def addScriptCode(hf, testruns): '</script>\n' hf.write(script_code); -def setRuntimeSuspend(before=True): - global sysvals - sv = sysvals - if sv.rs == 0: - return - if before: - # runtime suspend disable or enable - if sv.rs > 0: - sv.rstgt, sv.rsval, sv.rsdir = 'on', 'auto', 'enabled' - else: - sv.rstgt, sv.rsval, sv.rsdir = 'auto', 'on', 'disabled' - pprint('CONFIGURING RUNTIME SUSPEND...') - sv.rslist = deviceInfo(sv.rstgt) - for i in sv.rslist: - sv.setVal(sv.rsval, i) - pprint('runtime suspend %s on all devices (%d changed)' % (sv.rsdir, len(sv.rslist))) - pprint('waiting 5 seconds...') - time.sleep(5) - else: - # runtime suspend re-enable or re-disable - for i in sv.rslist: - sv.setVal(sv.rstgt, i) - pprint('runtime suspend settings restored on %d devices' % len(sv.rslist)) - # Function: executeSuspend # Description: # Execute system suspend through the sysfs interface, then copy the output # dmesg and ftrace files to the test output directory. def executeSuspend(quiet=False): - pm = ProcessMonitor() - tp = sysvals.tpath - if sysvals.wifi: - wifi = sysvals.checkWifi() + sv, tp, pm = sysvals, sysvals.tpath, ProcessMonitor() + if sv.wifi: + wifi = sv.checkWifi() + sv.dlog('wifi check, connected device is "%s"' % wifi) testdata = [] # run these commands to prepare the system for suspend - if sysvals.display: + if sv.display: if not quiet: - pprint('SET DISPLAY TO %s' % sysvals.display.upper()) - displayControl(sysvals.display) + pprint('SET DISPLAY TO %s' % sv.display.upper()) + ret = sv.displayControl(sv.display) + sv.dlog('xset display %s, ret = %d' % (sv.display, ret)) time.sleep(1) - if sysvals.sync: + if sv.sync: if not quiet: pprint('SYNCING FILESYSTEMS') + sv.dlog('syncing filesystems') call('sync', shell=True) - # mark the start point in the kernel ring buffer just as we start - sysvals.initdmesg() + sv.dlog('read dmesg') + sv.initdmesg() # start ftrace - if(sysvals.usecallgraph or sysvals.usetraceevents): + if(sv.usecallgraph or sv.usetraceevents): if not quiet: pprint('START TRACING') - sysvals.fsetVal('1', 'tracing_on') - if sysvals.useprocmon: + sv.dlog('start ftrace tracing') + sv.fsetVal('1', 'tracing_on') + if sv.useprocmon: + sv.dlog('start the process monitor') pm.start() - sysvals.cmdinfo(True) + sv.dlog('run the cmdinfo list before') + sv.cmdinfo(True) # execute however many s/r runs requested - for count in range(1,sysvals.execcount+1): + for count in range(1,sv.execcount+1): # x2delay in between test runs - if(count > 1 and sysvals.x2delay > 0): - sysvals.fsetVal('WAIT %d' % sysvals.x2delay, 'trace_marker') - time.sleep(sysvals.x2delay/1000.0) - sysvals.fsetVal('WAIT END', 'trace_marker') + if(count > 1 and sv.x2delay > 0): + sv.fsetVal('WAIT %d' % sv.x2delay, 'trace_marker') + time.sleep(sv.x2delay/1000.0) + sv.fsetVal('WAIT END', 'trace_marker') # start message - if sysvals.testcommand != '': + if sv.testcommand != '': pprint('COMMAND START') else: - if(sysvals.rtcwake): + if(sv.rtcwake): pprint('SUSPEND START') else: pprint('SUSPEND START (press a key to resume)') # set rtcwake - if(sysvals.rtcwake): + if(sv.rtcwake): if not quiet: - pprint('will issue an rtcwake in %d seconds' % sysvals.rtcwaketime) - sysvals.rtcWakeAlarmOn() + pprint('will issue an rtcwake in %d seconds' % sv.rtcwaketime) + sv.dlog('enable RTC wake alarm') + sv.rtcWakeAlarmOn() # start of suspend trace marker - if(sysvals.usecallgraph or sysvals.usetraceevents): - sysvals.fsetVal(datetime.now().strftime(sysvals.tmstart), 'trace_marker') + if(sv.usecallgraph or sv.usetraceevents): + sv.fsetVal(datetime.now().strftime(sv.tmstart), 'trace_marker') # predelay delay - if(count == 1 and sysvals.predelay > 0): - sysvals.fsetVal('WAIT %d' % sysvals.predelay, 'trace_marker') - time.sleep(sysvals.predelay/1000.0) - sysvals.fsetVal('WAIT END', 'trace_marker') + if(count == 1 and sv.predelay > 0): + sv.fsetVal('WAIT %d' % sv.predelay, 'trace_marker') + time.sleep(sv.predelay/1000.0) + sv.fsetVal('WAIT END', 'trace_marker') # initiate suspend or command + sv.dlog('system executing a suspend') tdata = {'error': ''} - if sysvals.testcommand != '': - res = call(sysvals.testcommand+' 2>&1', shell=True); + if sv.testcommand != '': + res = call(sv.testcommand+' 2>&1', shell=True); if res != 0: tdata['error'] = 'cmd returned %d' % res else: - mode = sysvals.suspendmode - if sysvals.memmode and os.path.exists(sysvals.mempowerfile): + mode = sv.suspendmode + if sv.memmode and os.path.exists(sv.mempowerfile): mode = 'mem' - pf = open(sysvals.mempowerfile, 'w') - pf.write(sysvals.memmode) - pf.close() - if sysvals.diskmode and os.path.exists(sysvals.diskpowerfile): + sv.testVal(sv.mempowerfile, 'radio', sv.memmode) + if sv.diskmode and os.path.exists(sv.diskpowerfile): mode = 'disk' - pf = open(sysvals.diskpowerfile, 'w') - pf.write(sysvals.diskmode) - pf.close() - if mode == 'freeze' and sysvals.haveTurbostat(): + sv.testVal(sv.diskpowerfile, 'radio', sv.diskmode) + if sv.acpidebug: + sv.testVal(sv.acpipath, 'acpi', '0xe') + if mode == 'freeze' and sv.haveTurbostat(): # execution will pause here - turbo = sysvals.turbostat() + turbo = sv.turbostat() if turbo: tdata['turbo'] = turbo else: - pf = open(sysvals.powerfile, 'w') + pf = open(sv.powerfile, 'w') pf.write(mode) # execution will pause here try: pf.close() except Exception as e: tdata['error'] = str(e) - if(sysvals.rtcwake): - sysvals.rtcWakeAlarmOff() + sv.dlog('system returned from resume') + # reset everything + sv.testVal('restoreall') + if(sv.rtcwake): + sv.dlog('disable RTC wake alarm') + sv.rtcWakeAlarmOff() # postdelay delay - if(count == sysvals.execcount and sysvals.postdelay > 0): - sysvals.fsetVal('WAIT %d' % sysvals.postdelay, 'trace_marker') - time.sleep(sysvals.postdelay/1000.0) - sysvals.fsetVal('WAIT END', 'trace_marker') + if(count == sv.execcount and sv.postdelay > 0): + sv.fsetVal('WAIT %d' % sv.postdelay, 'trace_marker') + time.sleep(sv.postdelay/1000.0) + sv.fsetVal('WAIT END', 'trace_marker') # return from suspend pprint('RESUME COMPLETE') - if(sysvals.usecallgraph or sysvals.usetraceevents): - sysvals.fsetVal(datetime.now().strftime(sysvals.tmend), 'trace_marker') - if sysvals.wifi and wifi: - tdata['wifi'] = sysvals.pollWifi(wifi) - if(sysvals.suspendmode == 'mem' or sysvals.suspendmode == 'command'): + if(sv.usecallgraph or sv.usetraceevents): + sv.fsetVal(datetime.now().strftime(sv.tmend), 'trace_marker') + if sv.wifi and wifi: + tdata['wifi'] = sv.pollWifi(wifi) + sv.dlog('wifi check, %s' % tdata['wifi']) + if(sv.suspendmode == 'mem' or sv.suspendmode == 'command'): + sv.dlog('read the ACPI FPDT') tdata['fw'] = getFPDT(False) testdata.append(tdata) - cmdafter = sysvals.cmdinfo(False) + sv.dlog('run the cmdinfo list after') + cmdafter = sv.cmdinfo(False) # stop ftrace - if(sysvals.usecallgraph or sysvals.usetraceevents): - if sysvals.useprocmon: + if(sv.usecallgraph or sv.usetraceevents): + if sv.useprocmon: + sv.dlog('stop the process monitor') pm.stop() - sysvals.fsetVal('0', 'tracing_on') + sv.fsetVal('0', 'tracing_on') # grab a copy of the dmesg output if not quiet: pprint('CAPTURING DMESG') - sysvals.getdmesg(testdata) + sysvals.dlog('EXECUTION TRACE END') + sv.getdmesg(testdata) # grab a copy of the ftrace output - if(sysvals.usecallgraph or sysvals.usetraceevents): + if(sv.usecallgraph or sv.usetraceevents): if not quiet: pprint('CAPTURING TRACE') - op = sysvals.writeDatafileHeader(sysvals.ftracefile, testdata) + op = sv.writeDatafileHeader(sv.ftracefile, testdata) fp = open(tp+'trace', 'r') for line in fp: op.write(line) op.close() - sysvals.fsetVal('', 'trace') - sysvals.platforminfo(cmdafter) + sv.fsetVal('', 'trace') + sv.platforminfo(cmdafter) def readFile(file): if os.path.islink(file): @@ -5586,39 +5659,6 @@ def dmidecode(mempath, fatal=False): count += 1 return out -def displayControl(cmd): - xset, ret = 'timeout 10 xset -d :0.0 {0}', 0 - if sysvals.sudouser: - xset = 'sudo -u %s %s' % (sysvals.sudouser, xset) - if cmd == 'init': - ret = call(xset.format('dpms 0 0 0'), shell=True) - if not ret: - ret = call(xset.format('s off'), shell=True) - elif cmd == 'reset': - ret = call(xset.format('s reset'), shell=True) - elif cmd in ['on', 'off', 'standby', 'suspend']: - b4 = displayControl('stat') - ret = call(xset.format('dpms force %s' % cmd), shell=True) - if not ret: - curr = displayControl('stat') - sysvals.vprint('Display Switched: %s -> %s' % (b4, curr)) - if curr != cmd: - sysvals.vprint('WARNING: Display failed to change to %s' % cmd) - if ret: - sysvals.vprint('WARNING: Display failed to change to %s with xset' % cmd) - return ret - elif cmd == 'stat': - fp = Popen(xset.format('q').split(' '), stdout=PIPE).stdout - ret = 'unknown' - for line in fp: - m = re.match('[\s]*Monitor is (?P<m>.*)', ascii(line)) - if(m and len(m.group('m')) >= 2): - out = m.group('m').lower() - ret = out[3:] if out[0:2] == 'in' else out - break - fp.close() - return ret - # Function: getFPDT # Description: # Read the acpi bios tables and pull out FPDT, the firmware data @@ -6001,8 +6041,19 @@ def rerunTest(htmlfile=''): # execute a suspend/resume, gather the logs, and generate the output def runTest(n=0, quiet=False): # prepare for the test - sysvals.initFtrace(quiet) sysvals.initTestOutput('suspend') + op = sysvals.writeDatafileHeader(sysvals.dmesgfile, []) + op.write('# EXECUTION TRACE START\n') + op.close() + if n <= 1: + if sysvals.rs != 0: + sysvals.dlog('%sabling runtime suspend' % ('en' if sysvals.rs > 0 else 'dis')) + sysvals.setRuntimeSuspend(True) + if sysvals.display: + ret = sysvals.displayControl('init') + sysvals.dlog('xset display init, ret = %d' % ret) + sysvals.dlog('initialize ftrace') + sysvals.initFtrace(quiet) # execute the test executeSuspend(quiet) @@ -6098,8 +6149,16 @@ def data_from_html(file, outpath, issues, fulldetail=False): if wifi: extra['wifi'] = wifi low = find_in_html(html, 'freeze time: <b>', ' ms</b>') - if low and 'waking' in low: - issue = 'FREEZEWAKE' + for lowstr in ['waking', '+']: + if not low: + break + if lowstr not in low: + continue + if lowstr == '+': + issue = 'S2LOOPx%d' % len(low.split('+')) + else: + m = re.match('.*waking *(?P<n>[0-9]*) *times.*', low) + issue = 'S2WAKEx%s' % m.group('n') if m else 'S2WAKExNaN' match = [i for i in issues if i['match'] == issue] if len(match) > 0: match[0]['count'] += 1 @@ -6605,6 +6664,11 @@ if __name__ == '__main__': val = next(args) except: doError('-info requires one string argument', True) + elif(arg == '-desc'): + try: + val = next(args) + except: + doError('-desc requires one string argument', True) elif(arg == '-rs'): try: val = next(args) @@ -6814,9 +6878,9 @@ if __name__ == '__main__': runSummary(sysvals.outdir, True, genhtml) elif(cmd in ['xon', 'xoff', 'xstandby', 'xsuspend', 'xinit', 'xreset']): sysvals.verbose = True - ret = displayControl(cmd[1:]) + ret = sysvals.displayControl(cmd[1:]) elif(cmd == 'xstat'): - pprint('Display Status: %s' % displayControl('stat').upper()) + pprint('Display Status: %s' % sysvals.displayControl('stat').upper()) elif(cmd == 'wificheck'): dev = sysvals.checkWifi() if dev: @@ -6854,12 +6918,8 @@ if __name__ == '__main__': if mode.startswith('disk-'): sysvals.diskmode = mode.split('-', 1)[-1] sysvals.suspendmode = 'disk' - sysvals.systemInfo(dmidecode(sysvals.mempath)) - setRuntimeSuspend(True) - if sysvals.display: - displayControl('init') failcnt, ret = 0, 0 if sysvals.multitest['run']: # run multiple tests in a separate subdirectory @@ -6900,7 +6960,10 @@ if __name__ == '__main__': sysvals.testdir = sysvals.outdir # run the test in the current directory ret = runTest() + + # reset to default values after testing if sysvals.display: - displayControl('reset') - setRuntimeSuspend(False) + sysvals.displayControl('reset') + if sysvals.rs != 0: + sysvals.setRuntimeSuspend(False) sys.exit(ret) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index cd089a505859..5390158cdb40 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -15,7 +15,7 @@ struct process_cmd_struct { int arg; }; -static const char *version_str = "v1.6"; +static const char *version_str = "v1.7"; static const int supported_api_ver = 1; static struct isst_if_platform_info isst_platform_info; static char *progname; @@ -328,8 +328,12 @@ int get_physical_die_id(int cpu) int core_id, pkg_id, die_id; ret = get_stored_topology_info(cpu, &core_id, &pkg_id, &die_id); - if (!ret) + if (!ret) { + if (die_id < 0) + die_id = 0; + return die_id; + } } if (ret < 0) diff --git a/tools/power/x86/intel-speed-select/isst-core.c b/tools/power/x86/intel-speed-select/isst-core.c index 1d7ecb54352e..8afd23407522 100644 --- a/tools/power/x86/intel-speed-select/isst-core.c +++ b/tools/power/x86/intel-speed-select/isst-core.c @@ -804,7 +804,7 @@ int isst_get_process_ctdp(int cpu, int tdp_level, struct isst_pkg_ctdp *pkg_dev) return ret; } - if (!pkg_dev->enabled) { + if (!pkg_dev->enabled && is_skx_based_platform()) { int freq; freq = get_cpufreq_base_freq(cpu); diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index 29715e9c2e06..60db0bb084d5 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -255,4 +255,5 @@ extern int is_clx_n_platform(void); extern int get_cpufreq_base_freq(int cpu); extern int isst_read_pm_config(int cpu, int *cp_state, int *cp_cap); extern void isst_display_error_info_message(int error, char *msg, int arg_valid, int arg); +extern int is_skx_based_platform(void); #endif diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f3a1746f7f45..389ea5209a83 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1831,6 +1831,25 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) return 0; } +int get_epb(int cpu) +{ + char path[128 + PATH_BYTES]; + int ret, epb = -1; + FILE *fp; + + sprintf(path, "/sys/devices/system/cpu/cpu%d/power/energy_perf_bias", cpu); + + fp = fopen_or_die(path, "r"); + + ret = fscanf(fp, "%d", &epb); + if (ret != 1) + err(1, "%s(%s)", __func__, path); + + fclose(fp); + + return epb; +} + void get_apic_id(struct thread_data *t) { unsigned int eax, ebx, ecx, edx; @@ -3917,9 +3936,8 @@ dump_sysfs_pstate_config(void) */ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) { - unsigned long long msr; char *epb_string; - int cpu; + int cpu, epb; if (!has_epb) return 0; @@ -3935,10 +3953,11 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -1; } - if (get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr)) + epb = get_epb(cpu); + if (epb < 0) return 0; - switch (msr & 0xF) { + switch (epb) { case ENERGY_PERF_BIAS_PERFORMANCE: epb_string = "performance"; break; @@ -3952,7 +3971,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) epb_string = "custom"; break; } - fprintf(outf, "cpu%d: MSR_IA32_ENERGY_PERF_BIAS: 0x%08llx (%s)\n", cpu, msr, epb_string); + fprintf(outf, "cpu%d: EPB: %d (%s)\n", cpu, epb, epb_string); return 0; } diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index ff6c6661f075..5fd9e594079c 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -91,6 +91,9 @@ unsigned int has_hwp_request_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int bdx_highest_ratio; +#define PATH_TO_CPU "/sys/devices/system/cpu/" +#define SYSFS_PATH_MAX 255 + /* * maintain compatibility with original implementation, but don't document it: */ @@ -721,6 +724,48 @@ int put_msr(int cpu, int offset, unsigned long long new_msr) return 0; } +static unsigned int read_sysfs(const char *path, char *buf, size_t buflen) +{ + ssize_t numread; + int fd; + + fd = open(path, O_RDONLY); + if (fd == -1) + return 0; + + numread = read(fd, buf, buflen - 1); + if (numread < 1) { + close(fd); + return 0; + } + + buf[numread] = '\0'; + close(fd); + + return (unsigned int) numread; +} + +static unsigned int write_sysfs(const char *path, char *buf, size_t buflen) +{ + ssize_t numwritten; + int fd; + + fd = open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + if (numwritten < 1) { + perror("write failed\n"); + close(fd); + return -1; + } + + close(fd); + + return (unsigned int) numwritten; +} + void print_hwp_cap(int cpu, struct msr_hwp_cap *cap, char *str) { if (cpu != -1) @@ -798,17 +843,61 @@ void write_hwp_request(int cpu, struct msr_hwp_request *hwp_req, unsigned int ms put_msr(cpu, msr_offset, msr); } +static int get_epb(int cpu) +{ + char path[SYSFS_PATH_MAX]; + char linebuf[3]; + char *endp; + long val; + + if (!has_epb) + return -1; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpu%u/power/energy_perf_bias", cpu); + + if (!read_sysfs(path, linebuf, 3)) + return -1; + + val = strtol(linebuf, &endp, 0); + if (endp == linebuf || errno == ERANGE) + return -1; + + return (int)val; +} + +static int set_epb(int cpu, int val) +{ + char path[SYSFS_PATH_MAX]; + char linebuf[3]; + char *endp; + int ret; + + if (!has_epb) + return -1; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpu%u/power/energy_perf_bias", cpu); + snprintf(linebuf, sizeof(linebuf), "%d", val); + + ret = write_sysfs(path, linebuf, 3); + if (ret <= 0) + return -1; + + val = strtol(linebuf, &endp, 0); + if (endp == linebuf || errno == ERANGE) + return -1; + + return (int)val; +} + int print_cpu_msrs(int cpu) { - unsigned long long msr; struct msr_hwp_request req; struct msr_hwp_cap cap; + int epb; - if (has_epb) { - get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr); - - printf("cpu%d: EPB %u\n", cpu, (unsigned int) msr); - } + epb = get_epb(cpu); + if (epb >= 0) + printf("cpu%d: EPB %u\n", cpu, (unsigned int) epb); if (!has_hwp) return 0; @@ -1091,15 +1180,15 @@ int enable_hwp_on_cpu(int cpu) int update_cpu_msrs(int cpu) { unsigned long long msr; - + int epb; if (update_epb) { - get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr); - put_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, new_epb); + epb = get_epb(cpu); + set_epb(cpu, new_epb); if (verbose) printf("cpu%d: ENERGY_PERF_BIAS old: %d new: %d\n", - cpu, (unsigned int) msr, (unsigned int) new_epb); + cpu, epb, (unsigned int) new_epb); } if (update_turbo) { diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index a7974638561c..1358e89cdf7d 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -59,6 +59,16 @@ $(call allow-override,LD,$(CROSS_COMPILE)ld) $(call allow-override,CXX,$(CROSS_COMPILE)g++) $(call allow-override,STRIP,$(CROSS_COMPILE)strip) +ifneq ($(LLVM),) +HOSTAR ?= llvm-ar +HOSTCC ?= clang +HOSTLD ?= ld.lld +else +HOSTAR ?= ar +HOSTCC ?= gcc +HOSTLD ?= ld +endif + ifeq ($(CC_NO_CLANG), 1) EXTRA_WARNINGS += -Wstrict-aliasing=3 endif diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index cb16d2aac51c..54188ee16c48 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -2040,7 +2040,7 @@ sub reboot_to { if ($reboot_type eq "grub") { run_ssh "'(echo \"savedefault --default=$grub_number --once\" | grub --batch)'"; - } elsif ($reboot_type eq "grub2") { + } elsif (($reboot_type eq "grub2") or ($reboot_type eq "grub2bls")) { run_ssh "$grub_reboot $grub_number"; } elsif ($reboot_type eq "syslinux") { run_ssh "$syslinux --once \\\"$syslinux_label\\\" $syslinux_path"; diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index 2e3cc0fac726..57c1724b7e5d 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -23,6 +23,11 @@ DEFAULT_KUNITCONFIG_PATH = 'arch/um/configs/kunit_defconfig' BROKEN_ALLCONFIG_PATH = 'tools/testing/kunit/configs/broken_on_uml.config' OUTFILE_PATH = 'test.log' +def get_file_path(build_dir, default): + if build_dir: + default = os.path.join(build_dir, default) + return default + class ConfigError(Exception): """Represents an error trying to configure the Linux kernel.""" @@ -97,9 +102,7 @@ class LinuxSourceTreeOperations(object): def linux_bin(self, params, timeout, build_dir): """Runs the Linux UML binary. Must be named 'linux'.""" - linux_bin = './linux' - if build_dir: - linux_bin = os.path.join(build_dir, 'linux') + linux_bin = get_file_path(build_dir, 'linux') outfile = get_outfile_path(build_dir) with open(outfile, 'w') as output: process = subprocess.Popen([linux_bin] + params, @@ -108,22 +111,13 @@ class LinuxSourceTreeOperations(object): process.wait(timeout) def get_kconfig_path(build_dir): - kconfig_path = KCONFIG_PATH - if build_dir: - kconfig_path = os.path.join(build_dir, KCONFIG_PATH) - return kconfig_path + return get_file_path(build_dir, KCONFIG_PATH) def get_kunitconfig_path(build_dir): - kunitconfig_path = KUNITCONFIG_PATH - if build_dir: - kunitconfig_path = os.path.join(build_dir, KUNITCONFIG_PATH) - return kunitconfig_path + return get_file_path(build_dir, KUNITCONFIG_PATH) def get_outfile_path(build_dir): - outfile_path = OUTFILE_PATH - if build_dir: - outfile_path = os.path.join(build_dir, OUTFILE_PATH) - return outfile_path + return get_file_path(build_dir, OUTFILE_PATH) class LinuxSourceTree(object): """Represents a Linux kernel source tree with KUnit tests.""" diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py index bbfe1b4e4c1c..6614ec4d0898 100644 --- a/tools/testing/kunit/kunit_parser.py +++ b/tools/testing/kunit/kunit_parser.py @@ -135,8 +135,8 @@ def parse_ok_not_ok_test_case(lines: List[str], test_case: TestCase) -> bool: else: return False -SUBTEST_DIAGNOSTIC = re.compile(r'^[\s]+# .*?: (.*)$') -DIAGNOSTIC_CRASH_MESSAGE = 'kunit test case crashed!' +SUBTEST_DIAGNOSTIC = re.compile(r'^[\s]+# (.*)$') +DIAGNOSTIC_CRASH_MESSAGE = re.compile(r'^[\s]+# .*?: kunit test case crashed!$') def parse_diagnostic(lines: List[str], test_case: TestCase) -> bool: save_non_diagnositic(lines, test_case) @@ -146,7 +146,8 @@ def parse_diagnostic(lines: List[str], test_case: TestCase) -> bool: match = SUBTEST_DIAGNOSTIC.match(line) if match: test_case.log.append(lines.pop(0)) - if match.group(1) == DIAGNOSTIC_CRASH_MESSAGE: + crash_match = DIAGNOSTIC_CRASH_MESSAGE.match(line) + if crash_match: test_case.status = TestStatus.TEST_CRASHED return True else: diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 2ac0fff6dad8..9b185bf82da8 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -23,7 +23,6 @@ #include "nfit_test.h" #include "../watermark.h" -#include <asm/copy_mc_test.h> #include <asm/mce.h> /* @@ -3284,107 +3283,6 @@ static struct platform_driver nfit_test_driver = { .id_table = nfit_test_id, }; -static char copy_mc_buf[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); - -enum INJECT { - INJECT_NONE, - INJECT_SRC, - INJECT_DST, -}; - -static void copy_mc_test_init(char *dst, char *src, size_t size) -{ - size_t i; - - memset(dst, 0xff, size); - for (i = 0; i < size; i++) - src[i] = (char) i; -} - -static bool copy_mc_test_validate(unsigned char *dst, unsigned char *src, - size_t size, unsigned long rem) -{ - size_t i; - - for (i = 0; i < size - rem; i++) - if (dst[i] != (unsigned char) i) { - pr_info_once("%s:%d: offset: %zd got: %#x expect: %#x\n", - __func__, __LINE__, i, dst[i], - (unsigned char) i); - return false; - } - for (i = size - rem; i < size; i++) - if (dst[i] != 0xffU) { - pr_info_once("%s:%d: offset: %zd got: %#x expect: 0xff\n", - __func__, __LINE__, i, dst[i]); - return false; - } - return true; -} - -void copy_mc_test(void) -{ - char *inject_desc[] = { "none", "source", "destination" }; - enum INJECT inj; - - if (IS_ENABLED(CONFIG_COPY_MC_TEST)) { - pr_info("%s: run...\n", __func__); - } else { - pr_info("%s: disabled, skip.\n", __func__); - return; - } - - for (inj = INJECT_NONE; inj <= INJECT_DST; inj++) { - int i; - - pr_info("%s: inject: %s\n", __func__, inject_desc[inj]); - for (i = 0; i < 512; i++) { - unsigned long expect, rem; - void *src, *dst; - bool valid; - - switch (inj) { - case INJECT_NONE: - copy_mc_inject_src(NULL); - copy_mc_inject_dst(NULL); - dst = ©_mc_buf[2048]; - src = ©_mc_buf[1024 - i]; - expect = 0; - break; - case INJECT_SRC: - copy_mc_inject_src(©_mc_buf[1024]); - copy_mc_inject_dst(NULL); - dst = ©_mc_buf[2048]; - src = ©_mc_buf[1024 - i]; - expect = 512 - i; - break; - case INJECT_DST: - copy_mc_inject_src(NULL); - copy_mc_inject_dst(©_mc_buf[2048]); - dst = ©_mc_buf[2048 - i]; - src = ©_mc_buf[1024]; - expect = 512 - i; - break; - } - - copy_mc_test_init(dst, src, 512); - rem = copy_mc_fragile(dst, src, 512); - valid = copy_mc_test_validate(dst, src, 512, expect); - if (rem == expect && valid) - continue; - pr_info("%s: copy(%#lx, %#lx, %d) off: %d rem: %ld %s expect: %ld\n", - __func__, - ((unsigned long) dst) & ~PAGE_MASK, - ((unsigned long ) src) & ~PAGE_MASK, - 512, i, rem, valid ? "valid" : "bad", - expect); - } - } - - copy_mc_inject_src(NULL); - copy_mc_inject_dst(NULL); -} - static __init int nfit_test_init(void) { int rc, i; @@ -3393,7 +3291,6 @@ static __init int nfit_test_init(void) libnvdimm_test(); acpi_nfit_test(); device_dax_test(); - copy_mc_test(); dax_pmem_test(); dax_pmem_core_test(); #ifdef CONFIG_DEV_DAX_PMEM_COMPAT diff --git a/tools/testing/scatterlist/main.c b/tools/testing/scatterlist/main.c index f561aed7c657..71c960dcd8a4 100644 --- a/tools/testing/scatterlist/main.c +++ b/tools/testing/scatterlist/main.c @@ -9,6 +9,7 @@ struct test { int alloc_ret; unsigned num_pages; unsigned *pfn; + unsigned *pfn_app; unsigned size; unsigned int max_seg; unsigned int expected_segments; @@ -50,33 +51,41 @@ static void fail(struct test *test, struct sg_table *st, const char *cond) int main(void) { - const unsigned int sgmax = SCATTERLIST_MAX_SEGMENT; + const unsigned int sgmax = UINT_MAX; struct test *test, tests[] = { - { -EINVAL, 1, pfn(0), PAGE_SIZE, 0, 1 }, - { 0, 1, pfn(0), PAGE_SIZE, PAGE_SIZE + 1, 1 }, - { 0, 1, pfn(0), PAGE_SIZE, sgmax + 1, 1 }, - { 0, 1, pfn(0), PAGE_SIZE, sgmax, 1 }, - { 0, 1, pfn(0), 1, sgmax, 1 }, - { 0, 2, pfn(0, 1), 2 * PAGE_SIZE, sgmax, 1 }, - { 0, 2, pfn(1, 0), 2 * PAGE_SIZE, sgmax, 2 }, - { 0, 3, pfn(0, 1, 2), 3 * PAGE_SIZE, sgmax, 1 }, - { 0, 3, pfn(0, 2, 1), 3 * PAGE_SIZE, sgmax, 3 }, - { 0, 3, pfn(0, 1, 3), 3 * PAGE_SIZE, sgmax, 2 }, - { 0, 3, pfn(1, 2, 4), 3 * PAGE_SIZE, sgmax, 2 }, - { 0, 3, pfn(1, 3, 4), 3 * PAGE_SIZE, sgmax, 2 }, - { 0, 4, pfn(0, 1, 3, 4), 4 * PAGE_SIZE, sgmax, 2 }, - { 0, 5, pfn(0, 1, 3, 4, 5), 5 * PAGE_SIZE, sgmax, 2 }, - { 0, 5, pfn(0, 1, 3, 4, 6), 5 * PAGE_SIZE, sgmax, 3 }, - { 0, 5, pfn(0, 1, 2, 3, 4), 5 * PAGE_SIZE, sgmax, 1 }, - { 0, 5, pfn(0, 1, 2, 3, 4), 5 * PAGE_SIZE, 2 * PAGE_SIZE, 3 }, - { 0, 6, pfn(0, 1, 2, 3, 4, 5), 6 * PAGE_SIZE, 2 * PAGE_SIZE, 3 }, - { 0, 6, pfn(0, 2, 3, 4, 5, 6), 6 * PAGE_SIZE, 2 * PAGE_SIZE, 4 }, - { 0, 6, pfn(0, 1, 3, 4, 5, 6), 6 * PAGE_SIZE, 2 * PAGE_SIZE, 3 }, - { 0, 0, NULL, 0, 0, 0 }, + { -EINVAL, 1, pfn(0), NULL, PAGE_SIZE, 0, 1 }, + { 0, 1, pfn(0), NULL, PAGE_SIZE, PAGE_SIZE + 1, 1 }, + { 0, 1, pfn(0), NULL, PAGE_SIZE, sgmax + 1, 1 }, + { 0, 1, pfn(0), NULL, PAGE_SIZE, sgmax, 1 }, + { 0, 1, pfn(0), NULL, 1, sgmax, 1 }, + { 0, 2, pfn(0, 1), NULL, 2 * PAGE_SIZE, sgmax, 1 }, + { 0, 2, pfn(1, 0), NULL, 2 * PAGE_SIZE, sgmax, 2 }, + { 0, 3, pfn(0, 1, 2), NULL, 3 * PAGE_SIZE, sgmax, 1 }, + { 0, 3, pfn(0, 1, 2), NULL, 3 * PAGE_SIZE, sgmax, 1 }, + { 0, 3, pfn(0, 1, 2), pfn(3, 4, 5), 3 * PAGE_SIZE, sgmax, 1 }, + { 0, 3, pfn(0, 1, 2), pfn(4, 5, 6), 3 * PAGE_SIZE, sgmax, 2 }, + { 0, 3, pfn(0, 2, 1), NULL, 3 * PAGE_SIZE, sgmax, 3 }, + { 0, 3, pfn(0, 1, 3), NULL, 3 * PAGE_SIZE, sgmax, 2 }, + { 0, 3, pfn(1, 2, 4), NULL, 3 * PAGE_SIZE, sgmax, 2 }, + { 0, 3, pfn(1, 3, 4), NULL, 3 * PAGE_SIZE, sgmax, 2 }, + { 0, 4, pfn(0, 1, 3, 4), NULL, 4 * PAGE_SIZE, sgmax, 2 }, + { 0, 5, pfn(0, 1, 3, 4, 5), NULL, 5 * PAGE_SIZE, sgmax, 2 }, + { 0, 5, pfn(0, 1, 3, 4, 6), NULL, 5 * PAGE_SIZE, sgmax, 3 }, + { 0, 5, pfn(0, 1, 2, 3, 4), NULL, 5 * PAGE_SIZE, sgmax, 1 }, + { 0, 5, pfn(0, 1, 2, 3, 4), NULL, 5 * PAGE_SIZE, 2 * PAGE_SIZE, + 3 }, + { 0, 6, pfn(0, 1, 2, 3, 4, 5), NULL, 6 * PAGE_SIZE, + 2 * PAGE_SIZE, 3 }, + { 0, 6, pfn(0, 2, 3, 4, 5, 6), NULL, 6 * PAGE_SIZE, + 2 * PAGE_SIZE, 4 }, + { 0, 6, pfn(0, 1, 3, 4, 5, 6), pfn(7, 8, 9, 10, 11, 12), + 6 * PAGE_SIZE, 12 * PAGE_SIZE, 2 }, + { 0, 0, NULL, NULL, 0, 0, 0 }, }; unsigned int i; for (i = 0, test = tests; test->expected_segments; test++, i++) { + int left_pages = test->pfn_app ? test->num_pages : 0; struct page *pages[MAX_PAGES]; struct sg_table st; struct scatterlist *sg; @@ -84,14 +93,23 @@ int main(void) set_pages(pages, test->pfn, test->num_pages); sg = __sg_alloc_table_from_pages(&st, pages, test->num_pages, 0, - test->size, test->max_seg, NULL, 0, GFP_KERNEL); + test->size, test->max_seg, NULL, left_pages, GFP_KERNEL); assert(PTR_ERR_OR_ZERO(sg) == test->alloc_ret); if (test->alloc_ret) continue; + if (test->pfn_app) { + set_pages(pages, test->pfn_app, test->num_pages); + sg = __sg_alloc_table_from_pages(&st, pages, test->num_pages, 0, + test->size, test->max_seg, sg, 0, GFP_KERNEL); + + assert(PTR_ERR_OR_ZERO(sg) == test->alloc_ret); + } + VALIDATE(st.nents == test->expected_segments, &st, test); - VALIDATE(st.orig_nents == test->expected_segments, &st, test); + if (!test->pfn_app) + VALIDATE(st.orig_nents == test->expected_segments, &st, test); sg_free_table(&st); } diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index d9c283503159..afbab4aeef3c 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -TARGETS = android -TARGETS += arm64 +TARGETS = arm64 TARGETS += bpf TARGETS += breakpoints TARGETS += capabilities @@ -50,12 +49,14 @@ TARGETS += openat2 TARGETS += rseq TARGETS += rtc TARGETS += seccomp +TARGETS += sgx TARGETS += sigaltstack TARGETS += size TARGETS += sparc64 TARGETS += splice TARGETS += static_keys TARGETS += sync +TARGETS += syscall_user_dispatch TARGETS += sysctl TARGETS += tc-testing TARGETS += timens @@ -65,6 +66,7 @@ endif TARGETS += tmpfs TARGETS += tpm2 TARGETS += user +TARGETS += vDSO TARGETS += vm TARGETS += x86 TARGETS += zram diff --git a/tools/testing/selftests/android/Makefile b/tools/testing/selftests/android/Makefile deleted file mode 100644 index 9258306cafe9..000000000000 --- a/tools/testing/selftests/android/Makefile +++ /dev/null @@ -1,39 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -SUBDIRS := ion - -TEST_PROGS := run.sh - -.PHONY: all clean - -include ../lib.mk - -all: - @for DIR in $(SUBDIRS); do \ - BUILD_TARGET=$(OUTPUT)/$$DIR; \ - mkdir $$BUILD_TARGET -p; \ - make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ - #SUBDIR test prog name should be in the form: SUBDIR_test.sh \ - TEST=$$DIR"_test.sh"; \ - if [ -e $$DIR/$$TEST ]; then \ - rsync -a $$DIR/$$TEST $$BUILD_TARGET/; \ - fi \ - done - -override define INSTALL_RULE - mkdir -p $(INSTALL_PATH) -install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) - - @for SUBDIR in $(SUBDIRS); do \ - BUILD_TARGET=$(OUTPUT)/$$SUBDIR; \ - mkdir $$BUILD_TARGET -p; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -C $$SUBDIR INSTALL_PATH=$(INSTALL_PATH)/$$SUBDIR install; \ - done; -endef - -override define CLEAN - @for DIR in $(SUBDIRS); do \ - BUILD_TARGET=$(OUTPUT)/$$DIR; \ - mkdir $$BUILD_TARGET -p; \ - make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ - done -endef diff --git a/tools/testing/selftests/android/config b/tools/testing/selftests/android/config deleted file mode 100644 index b4ad748a9dd9..000000000000 --- a/tools/testing/selftests/android/config +++ /dev/null @@ -1,5 +0,0 @@ -CONFIG_ANDROID=y -CONFIG_STAGING=y -CONFIG_ION=y -CONFIG_ION_SYSTEM_HEAP=y -CONFIG_DRM_VGEM=y diff --git a/tools/testing/selftests/android/ion/Makefile b/tools/testing/selftests/android/ion/Makefile deleted file mode 100644 index 42b71f005332..000000000000 --- a/tools/testing/selftests/android/ion/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -INCLUDEDIR := -I. -I../../../../../drivers/staging/android/uapi/ -I../../../../../usr/include/ -CFLAGS := $(CFLAGS) $(INCLUDEDIR) -Wall -O2 -g - -TEST_GEN_FILES := ionapp_export ionapp_import ionmap_test - -all: $(TEST_GEN_FILES) - -$(TEST_GEN_FILES): ipcsocket.c ionutils.c - -TEST_PROGS := ion_test.sh - -KSFT_KHDR_INSTALL := 1 -top_srcdir = ../../../../.. -include ../../lib.mk - -$(OUTPUT)/ionapp_export: ionapp_export.c ipcsocket.c ionutils.c -$(OUTPUT)/ionapp_import: ionapp_import.c ipcsocket.c ionutils.c -$(OUTPUT)/ionmap_test: ionmap_test.c ionutils.c ipcsocket.c diff --git a/tools/testing/selftests/android/ion/README b/tools/testing/selftests/android/ion/README deleted file mode 100644 index 21783e9c451e..000000000000 --- a/tools/testing/selftests/android/ion/README +++ /dev/null @@ -1,101 +0,0 @@ -ION BUFFER SHARING UTILITY -========================== -File: ion_test.sh : Utility to test ION driver buffer sharing mechanism. -Author: Pintu Kumar <pintu.ping@gmail.com> - -Introduction: -------------- -This is a test utility to verify ION buffer sharing in user space -between 2 independent processes. -It uses unix domain socket (with SCM_RIGHTS) as IPC to transfer an FD to -another process to share the same buffer. -This utility demonstrates how ION buffer sharing can be implemented between -two user space processes, using various heap types. -The following heap types are supported by ION driver. -ION_HEAP_TYPE_SYSTEM (0) -ION_HEAP_TYPE_SYSTEM_CONTIG (1) -ION_HEAP_TYPE_CARVEOUT (2) -ION_HEAP_TYPE_CHUNK (3) -ION_HEAP_TYPE_DMA (4) - -By default only the SYSTEM and SYSTEM_CONTIG heaps are supported. -Each heap is associated with the respective heap id. -This utility is designed in the form of client/server program. -The server part (ionapp_export) is the exporter of the buffer. -It is responsible for creating an ION client, allocating the buffer based on -the heap id, writing some data to this buffer and then exporting the FD -(associated with this buffer) to another process using socket IPC. -This FD is called as buffer FD (which is different than the ION client FD). - -The client part (ionapp_import) is the importer of the buffer. -It retrives the FD from the socket data and installs into its address space. -This new FD internally points to the same kernel buffer. -So first it reads the data that is stored in this buffer and prints it. -Then it writes the different size of data (it could be different data) to the -same buffer. -Finally the buffer FD must be closed by both the exporter and importer. -Thus the same kernel buffer is shared among two user space processes using -ION driver and only one time allocation. - -Prerequisite: -------------- -This utility works only if /dev/ion interface is present. -The following configs needs to be enabled in kernel to include ion driver. -CONFIG_ANDROID=y -CONFIG_STAGING=y -CONFIG_ION=y -CONFIG_ION_SYSTEM_HEAP=y - -This utility requires to be run as root user. - - -Compile and test: ------------------ -This utility is made to be run as part of kselftest framework in kernel. -To compile and run using kselftest you can simply do the following from the -kernel top directory. -linux$ make TARGETS=android kselftest -Or you can also use: -linux$ make -C tools/testing/selftests TARGETS=android run_tests -Using the selftest it can directly execute the ion_test.sh script to test the -buffer sharing using ion system heap. -Currently the heap size is hard coded as just 10 bytes inside this script. -You need to be a root user to run under selftest. - -You can also compile and test manually using the following steps: -ion$ make -These will generate 2 executable: ionapp_export, ionapp_import -Now you can run the export and import manually by specifying the heap type -and the heap size. -You can also directly execute the shell script to run the test automatically. -Simply use the following command to run the test. -ion$ sudo ./ion_test.sh - -Test Results: -------------- -The utility is verified on Ubuntu-32 bit system with Linux Kernel 4.14. -Here is the snapshot of the test result using kselftest. - -linux# make TARGETS=android kselftest -heap_type: 0, heap_size: 10 --------------------------------------- -heap type: 0 - heap id: 1 -heap name: ion_system_heap --------------------------------------- -Fill buffer content: -0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd -Sharing fd: 6, Client fd: 5 -<ion_close_buffer_fd>: buffer release successfully.... -Received buffer fd: 4 -Read buffer content: -0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0x0 0x0 0x0 0x0 0x0 0x0 -0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 -Fill buffer content: -0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd -0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd -0xfd 0xfd -<ion_close_buffer_fd>: buffer release successfully.... -ion_test.sh: heap_type: 0 - [PASS] - -ion_test.sh: done diff --git a/tools/testing/selftests/android/ion/ion.h b/tools/testing/selftests/android/ion/ion.h deleted file mode 100644 index 33db23018abf..000000000000 --- a/tools/testing/selftests/android/ion/ion.h +++ /dev/null @@ -1,134 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * ion.h - * - * Copyright (C) 2011 Google, Inc. - */ - -/* This file is copied from drivers/staging/android/uapi/ion.h - * This local copy is required for the selftest to pass, when build - * outside the kernel source tree. - * Please keep this file in sync with its original file until the - * ion driver is moved outside the staging tree. - */ - -#ifndef _UAPI_LINUX_ION_H -#define _UAPI_LINUX_ION_H - -#include <linux/ioctl.h> -#include <linux/types.h> - -/** - * enum ion_heap_types - list of all possible types of heaps - * @ION_HEAP_TYPE_SYSTEM: memory allocated via vmalloc - * @ION_HEAP_TYPE_SYSTEM_CONTIG: memory allocated via kmalloc - * @ION_HEAP_TYPE_CARVEOUT: memory allocated from a prereserved - * carveout heap, allocations are physically - * contiguous - * @ION_HEAP_TYPE_DMA: memory allocated via DMA API - * @ION_NUM_HEAPS: helper for iterating over heaps, a bit mask - * is used to identify the heaps, so only 32 - * total heap types are supported - */ -enum ion_heap_type { - ION_HEAP_TYPE_SYSTEM, - ION_HEAP_TYPE_SYSTEM_CONTIG, - ION_HEAP_TYPE_CARVEOUT, - ION_HEAP_TYPE_CHUNK, - ION_HEAP_TYPE_DMA, - ION_HEAP_TYPE_CUSTOM, /* - * must be last so device specific heaps always - * are at the end of this enum - */ -}; - -#define ION_NUM_HEAP_IDS (sizeof(unsigned int) * 8) - -/** - * allocation flags - the lower 16 bits are used by core ion, the upper 16 - * bits are reserved for use by the heaps themselves. - */ - -/* - * mappings of this buffer should be cached, ion will do cache maintenance - * when the buffer is mapped for dma - */ -#define ION_FLAG_CACHED 1 - -/** - * DOC: Ion Userspace API - * - * create a client by opening /dev/ion - * most operations handled via following ioctls - * - */ - -/** - * struct ion_allocation_data - metadata passed from userspace for allocations - * @len: size of the allocation - * @heap_id_mask: mask of heap ids to allocate from - * @flags: flags passed to heap - * @handle: pointer that will be populated with a cookie to use to - * refer to this allocation - * - * Provided by userspace as an argument to the ioctl - */ -struct ion_allocation_data { - __u64 len; - __u32 heap_id_mask; - __u32 flags; - __u32 fd; - __u32 unused; -}; - -#define MAX_HEAP_NAME 32 - -/** - * struct ion_heap_data - data about a heap - * @name - first 32 characters of the heap name - * @type - heap type - * @heap_id - heap id for the heap - */ -struct ion_heap_data { - char name[MAX_HEAP_NAME]; - __u32 type; - __u32 heap_id; - __u32 reserved0; - __u32 reserved1; - __u32 reserved2; -}; - -/** - * struct ion_heap_query - collection of data about all heaps - * @cnt - total number of heaps to be copied - * @heaps - buffer to copy heap data - */ -struct ion_heap_query { - __u32 cnt; /* Total number of heaps to be copied */ - __u32 reserved0; /* align to 64bits */ - __u64 heaps; /* buffer to be populated */ - __u32 reserved1; - __u32 reserved2; -}; - -#define ION_IOC_MAGIC 'I' - -/** - * DOC: ION_IOC_ALLOC - allocate memory - * - * Takes an ion_allocation_data struct and returns it with the handle field - * populated with the opaque handle for the allocation. - */ -#define ION_IOC_ALLOC _IOWR(ION_IOC_MAGIC, 0, \ - struct ion_allocation_data) - -/** - * DOC: ION_IOC_HEAP_QUERY - information about available heaps - * - * Takes an ion_heap_query structure and populates information about - * available Ion heaps. - */ -#define ION_IOC_HEAP_QUERY _IOWR(ION_IOC_MAGIC, 8, \ - struct ion_heap_query) - -#endif /* _UAPI_LINUX_ION_H */ diff --git a/tools/testing/selftests/android/ion/ion_test.sh b/tools/testing/selftests/android/ion/ion_test.sh deleted file mode 100755 index 69e676cfc94e..000000000000 --- a/tools/testing/selftests/android/ion/ion_test.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash - -heapsize=4096 -TCID="ion_test.sh" -errcode=0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -run_test() -{ - heaptype=$1 - ./ionapp_export -i $heaptype -s $heapsize & - sleep 1 - ./ionapp_import - if [ $? -ne 0 ]; then - echo "$TCID: heap_type: $heaptype - [FAIL]" - errcode=1 - else - echo "$TCID: heap_type: $heaptype - [PASS]" - fi - sleep 1 - echo "" -} - -check_root() -{ - uid=$(id -u) - if [ $uid -ne 0 ]; then - echo $TCID: must be run as root >&2 - exit $ksft_skip - fi -} - -check_device() -{ - DEVICE=/dev/ion - if [ ! -e $DEVICE ]; then - echo $TCID: No $DEVICE device found >&2 - echo $TCID: May be CONFIG_ION is not set >&2 - exit $ksft_skip - fi -} - -main_function() -{ - check_device - check_root - - # ION_SYSTEM_HEAP TEST - run_test 0 - # ION_SYSTEM_CONTIG_HEAP TEST - run_test 1 -} - -main_function -echo "$TCID: done" -exit $errcode diff --git a/tools/testing/selftests/android/ion/ionapp_export.c b/tools/testing/selftests/android/ion/ionapp_export.c deleted file mode 100644 index 063b7830d1bd..000000000000 --- a/tools/testing/selftests/android/ion/ionapp_export.c +++ /dev/null @@ -1,127 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * ionapp_export.c - * - * It is a user space utility to create and export android - * ion memory buffer fd to another process using unix domain socket as IPC. - * This acts like a server for ionapp_import(client). - * So, this server has to be started first before the client. - * - * Copyright (C) 2017 Pintu Kumar <pintu.ping@gmail.com> - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include <errno.h> -#include <sys/time.h> -#include "ionutils.h" -#include "ipcsocket.h" - - -void print_usage(int argc, char *argv[]) -{ - printf("Usage: %s [-h <help>] [-i <heap id>] [-s <size in bytes>]\n", - argv[0]); -} - -int main(int argc, char *argv[]) -{ - int opt, ret, status, heapid; - int sockfd, client_fd, shared_fd; - unsigned char *map_buf; - unsigned long map_len, heap_type, heap_size, flags; - struct ion_buffer_info info; - struct socket_info skinfo; - - if (argc < 2) { - print_usage(argc, argv); - return -1; - } - - heap_size = 0; - flags = 0; - heap_type = ION_HEAP_TYPE_SYSTEM; - - while ((opt = getopt(argc, argv, "hi:s:")) != -1) { - switch (opt) { - case 'h': - print_usage(argc, argv); - exit(0); - break; - case 'i': - heapid = atoi(optarg); - switch (heapid) { - case 0: - heap_type = ION_HEAP_TYPE_SYSTEM; - break; - case 1: - heap_type = ION_HEAP_TYPE_SYSTEM_CONTIG; - break; - default: - printf("ERROR: heap type not supported\n"); - exit(1); - } - break; - case 's': - heap_size = atoi(optarg); - break; - default: - print_usage(argc, argv); - exit(1); - break; - } - } - - if (heap_size <= 0) { - printf("heap_size cannot be 0\n"); - print_usage(argc, argv); - exit(1); - } - - printf("heap_type: %ld, heap_size: %ld\n", heap_type, heap_size); - info.heap_type = heap_type; - info.heap_size = heap_size; - info.flag_type = flags; - - /* This is server: open the socket connection first */ - /* Here; 1 indicates server or exporter */ - status = opensocket(&sockfd, SOCKET_NAME, 1); - if (status < 0) { - fprintf(stderr, "<%s>: Failed opensocket.\n", __func__); - goto err_socket; - } - skinfo.sockfd = sockfd; - - ret = ion_export_buffer_fd(&info); - if (ret < 0) { - fprintf(stderr, "FAILED: ion_get_buffer_fd\n"); - goto err_export; - } - client_fd = info.ionfd; - shared_fd = info.buffd; - map_buf = info.buffer; - map_len = info.buflen; - write_buffer(map_buf, map_len); - - /* share ion buf fd with other user process */ - printf("Sharing fd: %d, Client fd: %d\n", shared_fd, client_fd); - skinfo.datafd = shared_fd; - skinfo.buflen = map_len; - - ret = socket_send_fd(&skinfo); - if (ret < 0) { - fprintf(stderr, "FAILED: socket_send_fd\n"); - goto err_send; - } - -err_send: -err_export: - ion_close_buffer_fd(&info); - -err_socket: - closesocket(sockfd, SOCKET_NAME); - - return 0; -} diff --git a/tools/testing/selftests/android/ion/ionapp_import.c b/tools/testing/selftests/android/ion/ionapp_import.c deleted file mode 100644 index 54b580cb04f6..000000000000 --- a/tools/testing/selftests/android/ion/ionapp_import.c +++ /dev/null @@ -1,79 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * ionapp_import.c - * - * It is a user space utility to receive android ion memory buffer fd - * over unix domain socket IPC that can be exported by ionapp_export. - * This acts like a client for ionapp_export. - * - * Copyright (C) 2017 Pintu Kumar <pintu.ping@gmail.com> - */ - -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <string.h> -#include "ionutils.h" -#include "ipcsocket.h" - - -int main(void) -{ - int ret, status; - int sockfd, shared_fd; - unsigned char *map_buf; - unsigned long map_len; - struct ion_buffer_info info; - struct socket_info skinfo; - - /* This is the client part. Here 0 means client or importer */ - status = opensocket(&sockfd, SOCKET_NAME, 0); - if (status < 0) { - fprintf(stderr, "No exporter exists...\n"); - ret = status; - goto err_socket; - } - - skinfo.sockfd = sockfd; - - ret = socket_receive_fd(&skinfo); - if (ret < 0) { - fprintf(stderr, "Failed: socket_receive_fd\n"); - goto err_recv; - } - - shared_fd = skinfo.datafd; - printf("Received buffer fd: %d\n", shared_fd); - if (shared_fd <= 0) { - fprintf(stderr, "ERROR: improper buf fd\n"); - ret = -1; - goto err_fd; - } - - memset(&info, 0, sizeof(info)); - info.buffd = shared_fd; - info.buflen = ION_BUFFER_LEN; - - ret = ion_import_buffer_fd(&info); - if (ret < 0) { - fprintf(stderr, "Failed: ion_use_buffer_fd\n"); - goto err_import; - } - - map_buf = info.buffer; - map_len = info.buflen; - read_buffer(map_buf, map_len); - - /* Write probably new data to the same buffer again */ - map_len = ION_BUFFER_LEN; - write_buffer(map_buf, map_len); - -err_import: - ion_close_buffer_fd(&info); -err_fd: -err_recv: -err_socket: - closesocket(sockfd, SOCKET_NAME); - - return ret; -} diff --git a/tools/testing/selftests/android/ion/ionmap_test.c b/tools/testing/selftests/android/ion/ionmap_test.c deleted file mode 100644 index dab36b06b37d..000000000000 --- a/tools/testing/selftests/android/ion/ionmap_test.c +++ /dev/null @@ -1,136 +0,0 @@ -#include <errno.h> -#include <fcntl.h> -#include <stdio.h> -#include <stdint.h> -#include <string.h> -#include <unistd.h> - -#include <sys/ioctl.h> -#include <sys/types.h> -#include <sys/stat.h> - -#include <linux/dma-buf.h> - -#include <drm/drm.h> - -#include "ion.h" -#include "ionutils.h" - -int check_vgem(int fd) -{ - drm_version_t version = { 0 }; - char name[5]; - int ret; - - version.name_len = 4; - version.name = name; - - ret = ioctl(fd, DRM_IOCTL_VERSION, &version); - if (ret) - return 1; - - return strcmp(name, "vgem"); -} - -int open_vgem(void) -{ - int i, fd; - const char *drmstr = "/dev/dri/card"; - - fd = -1; - for (i = 0; i < 16; i++) { - char name[80]; - - sprintf(name, "%s%u", drmstr, i); - - fd = open(name, O_RDWR); - if (fd < 0) - continue; - - if (check_vgem(fd)) { - close(fd); - continue; - } else { - break; - } - - } - return fd; -} - -int import_vgem_fd(int vgem_fd, int dma_buf_fd, uint32_t *handle) -{ - struct drm_prime_handle import_handle = { 0 }; - int ret; - - import_handle.fd = dma_buf_fd; - import_handle.flags = 0; - import_handle.handle = 0; - - ret = ioctl(vgem_fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &import_handle); - if (ret == 0) - *handle = import_handle.handle; - return ret; -} - -void close_handle(int vgem_fd, uint32_t handle) -{ - struct drm_gem_close close = { 0 }; - - close.handle = handle; - ioctl(vgem_fd, DRM_IOCTL_GEM_CLOSE, &close); -} - -int main() -{ - int ret, vgem_fd; - struct ion_buffer_info info; - uint32_t handle = 0; - struct dma_buf_sync sync = { 0 }; - - info.heap_type = ION_HEAP_TYPE_SYSTEM; - info.heap_size = 4096; - info.flag_type = ION_FLAG_CACHED; - - ret = ion_export_buffer_fd(&info); - if (ret < 0) { - printf("ion buffer alloc failed\n"); - return -1; - } - - vgem_fd = open_vgem(); - if (vgem_fd < 0) { - ret = vgem_fd; - printf("Failed to open vgem\n"); - goto out_ion; - } - - ret = import_vgem_fd(vgem_fd, info.buffd, &handle); - - if (ret < 0) { - printf("Failed to import buffer\n"); - goto out_vgem; - } - - sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_RW; - ret = ioctl(info.buffd, DMA_BUF_IOCTL_SYNC, &sync); - if (ret) - printf("sync start failed %d\n", errno); - - memset(info.buffer, 0xff, 4096); - - sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW; - ret = ioctl(info.buffd, DMA_BUF_IOCTL_SYNC, &sync); - if (ret) - printf("sync end failed %d\n", errno); - - close_handle(vgem_fd, handle); - ret = 0; - -out_vgem: - close(vgem_fd); -out_ion: - ion_close_buffer_fd(&info); - printf("done.\n"); - return ret; -} diff --git a/tools/testing/selftests/android/ion/ionutils.c b/tools/testing/selftests/android/ion/ionutils.c deleted file mode 100644 index 7d1d37c4ef6a..000000000000 --- a/tools/testing/selftests/android/ion/ionutils.c +++ /dev/null @@ -1,253 +0,0 @@ -#include <stdio.h> -#include <string.h> -#include <unistd.h> -#include <fcntl.h> -#include <errno.h> -//#include <stdint.h> -#include <sys/ioctl.h> -#include <sys/mman.h> -#include "ionutils.h" -#include "ipcsocket.h" - - -void write_buffer(void *buffer, unsigned long len) -{ - int i; - unsigned char *ptr = (unsigned char *)buffer; - - if (!ptr) { - fprintf(stderr, "<%s>: Invalid buffer...\n", __func__); - return; - } - - printf("Fill buffer content:\n"); - memset(ptr, 0xfd, len); - for (i = 0; i < len; i++) - printf("0x%x ", ptr[i]); - printf("\n"); -} - -void read_buffer(void *buffer, unsigned long len) -{ - int i; - unsigned char *ptr = (unsigned char *)buffer; - - if (!ptr) { - fprintf(stderr, "<%s>: Invalid buffer...\n", __func__); - return; - } - - printf("Read buffer content:\n"); - for (i = 0; i < len; i++) - printf("0x%x ", ptr[i]); - printf("\n"); -} - -int ion_export_buffer_fd(struct ion_buffer_info *ion_info) -{ - int i, ret, ionfd, buffer_fd; - unsigned int heap_id; - unsigned long maplen; - unsigned char *map_buffer; - struct ion_allocation_data alloc_data; - struct ion_heap_query query; - struct ion_heap_data heap_data[MAX_HEAP_COUNT]; - - if (!ion_info) { - fprintf(stderr, "<%s>: Invalid ion info\n", __func__); - return -1; - } - - /* Create an ION client */ - ionfd = open(ION_DEVICE, O_RDWR); - if (ionfd < 0) { - fprintf(stderr, "<%s>: Failed to open ion client: %s\n", - __func__, strerror(errno)); - return -1; - } - - memset(&query, 0, sizeof(query)); - query.cnt = MAX_HEAP_COUNT; - query.heaps = (unsigned long int)&heap_data[0]; - /* Query ION heap_id_mask from ION heap */ - ret = ioctl(ionfd, ION_IOC_HEAP_QUERY, &query); - if (ret < 0) { - fprintf(stderr, "<%s>: Failed: ION_IOC_HEAP_QUERY: %s\n", - __func__, strerror(errno)); - goto err_query; - } - - heap_id = MAX_HEAP_COUNT + 1; - for (i = 0; i < query.cnt; i++) { - if (heap_data[i].type == ion_info->heap_type) { - heap_id = heap_data[i].heap_id; - break; - } - } - - if (heap_id > MAX_HEAP_COUNT) { - fprintf(stderr, "<%s>: ERROR: heap type does not exists\n", - __func__); - goto err_heap; - } - - alloc_data.len = ion_info->heap_size; - alloc_data.heap_id_mask = 1 << heap_id; - alloc_data.flags = ion_info->flag_type; - - /* Allocate memory for this ION client as per heap_type */ - ret = ioctl(ionfd, ION_IOC_ALLOC, &alloc_data); - if (ret < 0) { - fprintf(stderr, "<%s>: Failed: ION_IOC_ALLOC: %s\n", - __func__, strerror(errno)); - goto err_alloc; - } - - /* This will return a valid buffer fd */ - buffer_fd = alloc_data.fd; - maplen = alloc_data.len; - - if (buffer_fd < 0 || maplen <= 0) { - fprintf(stderr, "<%s>: Invalid map data, fd: %d, len: %ld\n", - __func__, buffer_fd, maplen); - goto err_fd_data; - } - - /* Create memory mapped buffer for the buffer fd */ - map_buffer = (unsigned char *)mmap(NULL, maplen, PROT_READ|PROT_WRITE, - MAP_SHARED, buffer_fd, 0); - if (map_buffer == MAP_FAILED) { - fprintf(stderr, "<%s>: Failed: mmap: %s\n", - __func__, strerror(errno)); - goto err_mmap; - } - - ion_info->ionfd = ionfd; - ion_info->buffd = buffer_fd; - ion_info->buffer = map_buffer; - ion_info->buflen = maplen; - - return 0; - - munmap(map_buffer, maplen); - -err_fd_data: -err_mmap: - /* in case of error: close the buffer fd */ - if (buffer_fd) - close(buffer_fd); - -err_query: -err_heap: -err_alloc: - /* In case of error: close the ion client fd */ - if (ionfd) - close(ionfd); - - return -1; -} - -int ion_import_buffer_fd(struct ion_buffer_info *ion_info) -{ - int buffd; - unsigned char *map_buf; - unsigned long map_len; - - if (!ion_info) { - fprintf(stderr, "<%s>: Invalid ion info\n", __func__); - return -1; - } - - map_len = ion_info->buflen; - buffd = ion_info->buffd; - - if (buffd < 0 || map_len <= 0) { - fprintf(stderr, "<%s>: Invalid map data, fd: %d, len: %ld\n", - __func__, buffd, map_len); - goto err_buffd; - } - - map_buf = (unsigned char *)mmap(NULL, map_len, PROT_READ|PROT_WRITE, - MAP_SHARED, buffd, 0); - if (map_buf == MAP_FAILED) { - printf("<%s>: Failed - mmap: %s\n", - __func__, strerror(errno)); - goto err_mmap; - } - - ion_info->buffer = map_buf; - ion_info->buflen = map_len; - - return 0; - -err_mmap: - if (buffd) - close(buffd); - -err_buffd: - return -1; -} - -void ion_close_buffer_fd(struct ion_buffer_info *ion_info) -{ - if (ion_info) { - /* unmap the buffer properly in the end */ - munmap(ion_info->buffer, ion_info->buflen); - /* close the buffer fd */ - if (ion_info->buffd > 0) - close(ion_info->buffd); - /* Finally, close the client fd */ - if (ion_info->ionfd > 0) - close(ion_info->ionfd); - } -} - -int socket_send_fd(struct socket_info *info) -{ - int status; - int fd, sockfd; - struct socketdata skdata; - - if (!info) { - fprintf(stderr, "<%s>: Invalid socket info\n", __func__); - return -1; - } - - sockfd = info->sockfd; - fd = info->datafd; - memset(&skdata, 0, sizeof(skdata)); - skdata.data = fd; - skdata.len = sizeof(skdata.data); - status = sendtosocket(sockfd, &skdata); - if (status < 0) { - fprintf(stderr, "<%s>: Failed: sendtosocket\n", __func__); - return -1; - } - - return 0; -} - -int socket_receive_fd(struct socket_info *info) -{ - int status; - int fd, sockfd; - struct socketdata skdata; - - if (!info) { - fprintf(stderr, "<%s>: Invalid socket info\n", __func__); - return -1; - } - - sockfd = info->sockfd; - memset(&skdata, 0, sizeof(skdata)); - status = receivefromsocket(sockfd, &skdata); - if (status < 0) { - fprintf(stderr, "<%s>: Failed: receivefromsocket\n", __func__); - return -1; - } - - fd = (int)skdata.data; - info->datafd = fd; - - return status; -} diff --git a/tools/testing/selftests/android/ion/ionutils.h b/tools/testing/selftests/android/ion/ionutils.h deleted file mode 100644 index 9941eb858576..000000000000 --- a/tools/testing/selftests/android/ion/ionutils.h +++ /dev/null @@ -1,55 +0,0 @@ -#ifndef __ION_UTILS_H -#define __ION_UTILS_H - -#include "ion.h" - -#define SOCKET_NAME "ion_socket" -#define ION_DEVICE "/dev/ion" - -#define ION_BUFFER_LEN 4096 -#define MAX_HEAP_COUNT ION_HEAP_TYPE_CUSTOM - -struct socket_info { - int sockfd; - int datafd; - unsigned long buflen; -}; - -struct ion_buffer_info { - int ionfd; - int buffd; - unsigned int heap_type; - unsigned int flag_type; - unsigned long heap_size; - unsigned long buflen; - unsigned char *buffer; -}; - - -/* This is used to fill the data into the mapped buffer */ -void write_buffer(void *buffer, unsigned long len); - -/* This is used to read the data from the exported buffer */ -void read_buffer(void *buffer, unsigned long len); - -/* This is used to create an ION buffer FD for the kernel buffer - * So you can export this same buffer to others in the form of FD - */ -int ion_export_buffer_fd(struct ion_buffer_info *ion_info); - -/* This is used to import or map an exported FD. - * So we point to same buffer without making a copy. Hence zero-copy. - */ -int ion_import_buffer_fd(struct ion_buffer_info *ion_info); - -/* This is used to close all references for the ION client */ -void ion_close_buffer_fd(struct ion_buffer_info *ion_info); - -/* This is used to send FD to another process using socket IPC */ -int socket_send_fd(struct socket_info *skinfo); - -/* This is used to receive FD from another process using socket IPC */ -int socket_receive_fd(struct socket_info *skinfo); - - -#endif diff --git a/tools/testing/selftests/android/ion/ipcsocket.c b/tools/testing/selftests/android/ion/ipcsocket.c deleted file mode 100644 index 7dc521002095..000000000000 --- a/tools/testing/selftests/android/ion/ipcsocket.c +++ /dev/null @@ -1,227 +0,0 @@ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/socket.h> -#include <sys/time.h> -#include <sys/un.h> -#include <errno.h> - -#include "ipcsocket.h" - - -int opensocket(int *sockfd, const char *name, int connecttype) -{ - int ret, temp = 1; - - if (!name || strlen(name) > MAX_SOCK_NAME_LEN) { - fprintf(stderr, "<%s>: Invalid socket name.\n", __func__); - return -1; - } - - ret = socket(PF_LOCAL, SOCK_STREAM, 0); - if (ret < 0) { - fprintf(stderr, "<%s>: Failed socket: <%s>\n", - __func__, strerror(errno)); - return ret; - } - - *sockfd = ret; - if (setsockopt(*sockfd, SOL_SOCKET, SO_REUSEADDR, - (char *)&temp, sizeof(int)) < 0) { - fprintf(stderr, "<%s>: Failed setsockopt: <%s>\n", - __func__, strerror(errno)); - goto err; - } - - sprintf(sock_name, "/tmp/%s", name); - - if (connecttype == 1) { - /* This is for Server connection */ - struct sockaddr_un skaddr; - int clientfd; - socklen_t sklen; - - unlink(sock_name); - memset(&skaddr, 0, sizeof(skaddr)); - skaddr.sun_family = AF_LOCAL; - strcpy(skaddr.sun_path, sock_name); - - ret = bind(*sockfd, (struct sockaddr *)&skaddr, - SUN_LEN(&skaddr)); - if (ret < 0) { - fprintf(stderr, "<%s>: Failed bind: <%s>\n", - __func__, strerror(errno)); - goto err; - } - - ret = listen(*sockfd, 5); - if (ret < 0) { - fprintf(stderr, "<%s>: Failed listen: <%s>\n", - __func__, strerror(errno)); - goto err; - } - - memset(&skaddr, 0, sizeof(skaddr)); - sklen = sizeof(skaddr); - - ret = accept(*sockfd, (struct sockaddr *)&skaddr, - (socklen_t *)&sklen); - if (ret < 0) { - fprintf(stderr, "<%s>: Failed accept: <%s>\n", - __func__, strerror(errno)); - goto err; - } - - clientfd = ret; - *sockfd = clientfd; - } else { - /* This is for client connection */ - struct sockaddr_un skaddr; - - memset(&skaddr, 0, sizeof(skaddr)); - skaddr.sun_family = AF_LOCAL; - strcpy(skaddr.sun_path, sock_name); - - ret = connect(*sockfd, (struct sockaddr *)&skaddr, - SUN_LEN(&skaddr)); - if (ret < 0) { - fprintf(stderr, "<%s>: Failed connect: <%s>\n", - __func__, strerror(errno)); - goto err; - } - } - - return 0; - -err: - if (*sockfd) - close(*sockfd); - - return ret; -} - -int sendtosocket(int sockfd, struct socketdata *skdata) -{ - int ret, buffd; - unsigned int len; - char cmsg_b[CMSG_SPACE(sizeof(int))]; - struct cmsghdr *cmsg; - struct msghdr msgh; - struct iovec iov; - struct timeval timeout; - fd_set selFDs; - - if (!skdata) { - fprintf(stderr, "<%s>: socketdata is NULL\n", __func__); - return -1; - } - - FD_ZERO(&selFDs); - FD_SET(0, &selFDs); - FD_SET(sockfd, &selFDs); - timeout.tv_sec = 20; - timeout.tv_usec = 0; - - ret = select(sockfd+1, NULL, &selFDs, NULL, &timeout); - if (ret < 0) { - fprintf(stderr, "<%s>: Failed select: <%s>\n", - __func__, strerror(errno)); - return -1; - } - - if (FD_ISSET(sockfd, &selFDs)) { - buffd = skdata->data; - len = skdata->len; - memset(&msgh, 0, sizeof(msgh)); - msgh.msg_control = &cmsg_b; - msgh.msg_controllen = CMSG_LEN(len); - iov.iov_base = "OK"; - iov.iov_len = 2; - msgh.msg_iov = &iov; - msgh.msg_iovlen = 1; - cmsg = CMSG_FIRSTHDR(&msgh); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(len); - memcpy(CMSG_DATA(cmsg), &buffd, len); - - ret = sendmsg(sockfd, &msgh, MSG_DONTWAIT); - if (ret < 0) { - fprintf(stderr, "<%s>: Failed sendmsg: <%s>\n", - __func__, strerror(errno)); - return -1; - } - } - - return 0; -} - -int receivefromsocket(int sockfd, struct socketdata *skdata) -{ - int ret, buffd; - unsigned int len = 0; - char cmsg_b[CMSG_SPACE(sizeof(int))]; - struct cmsghdr *cmsg; - struct msghdr msgh; - struct iovec iov; - fd_set recvFDs; - char data[32]; - - if (!skdata) { - fprintf(stderr, "<%s>: socketdata is NULL\n", __func__); - return -1; - } - - FD_ZERO(&recvFDs); - FD_SET(0, &recvFDs); - FD_SET(sockfd, &recvFDs); - - ret = select(sockfd+1, &recvFDs, NULL, NULL, NULL); - if (ret < 0) { - fprintf(stderr, "<%s>: Failed select: <%s>\n", - __func__, strerror(errno)); - return -1; - } - - if (FD_ISSET(sockfd, &recvFDs)) { - len = sizeof(buffd); - memset(&msgh, 0, sizeof(msgh)); - msgh.msg_control = &cmsg_b; - msgh.msg_controllen = CMSG_LEN(len); - iov.iov_base = data; - iov.iov_len = sizeof(data)-1; - msgh.msg_iov = &iov; - msgh.msg_iovlen = 1; - cmsg = CMSG_FIRSTHDR(&msgh); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(len); - - ret = recvmsg(sockfd, &msgh, MSG_DONTWAIT); - if (ret < 0) { - fprintf(stderr, "<%s>: Failed recvmsg: <%s>\n", - __func__, strerror(errno)); - return -1; - } - - memcpy(&buffd, CMSG_DATA(cmsg), len); - skdata->data = buffd; - skdata->len = len; - } - return 0; -} - -int closesocket(int sockfd, char *name) -{ - char sockname[MAX_SOCK_NAME_LEN]; - - if (sockfd) - close(sockfd); - sprintf(sockname, "/tmp/%s", name); - unlink(sockname); - shutdown(sockfd, 2); - - return 0; -} diff --git a/tools/testing/selftests/android/ion/ipcsocket.h b/tools/testing/selftests/android/ion/ipcsocket.h deleted file mode 100644 index b3e84498a8a1..000000000000 --- a/tools/testing/selftests/android/ion/ipcsocket.h +++ /dev/null @@ -1,35 +0,0 @@ - -#ifndef _IPCSOCKET_H -#define _IPCSOCKET_H - - -#define MAX_SOCK_NAME_LEN 64 - -char sock_name[MAX_SOCK_NAME_LEN]; - -/* This structure is responsible for holding the IPC data - * data: hold the buffer fd - * len: just the length of 32-bit integer fd - */ -struct socketdata { - int data; - unsigned int len; -}; - -/* This API is used to open the IPC socket connection - * name: implies a unique socket name in the system - * connecttype: implies server(0) or client(1) - */ -int opensocket(int *sockfd, const char *name, int connecttype); - -/* This is the API to send socket data over IPC socket */ -int sendtosocket(int sockfd, struct socketdata *data); - -/* This is the API to receive socket data over IPC socket */ -int receivefromsocket(int sockfd, struct socketdata *data); - -/* This is the API to close the socket connection */ -int closesocket(int sockfd, char *name); - - -#endif diff --git a/tools/testing/selftests/android/run.sh b/tools/testing/selftests/android/run.sh deleted file mode 100755 index dd8edf291454..000000000000 --- a/tools/testing/selftests/android/run.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh - -(cd ion; ./ion_test.sh) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 3ab1200e172f..f5b7ef93618c 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -8,7 +8,6 @@ FEATURE-DUMP.libbpf fixdep test_dev_cgroup /test_progs* -test_tcpbpf_user test_verifier_log feature test_sock @@ -36,3 +35,5 @@ test_cpp /tools /runqslower /bench +*.ko +xdpxceiver diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 542768f5195b..8c33e999319a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -19,7 +19,6 @@ ifneq ($(wildcard $(GENHDR)),) endif CLANG ?= clang -LLC ?= llc LLVM_OBJCOPY ?= llvm-objcopy BPF_GCC ?= $(shell command -v bpf-gcc;) SAN_CFLAGS ?= @@ -32,7 +31,7 @@ LDLIBS += -lcap -lelf -lz -lrt -lpthread # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ - test_verifier_log test_dev_cgroup test_tcpbpf_user \ + test_verifier_log test_dev_cgroup \ test_sock test_sockmap get_cgroup_id_user test_socket_cookie \ test_cgroup_storage \ test_netcnt test_tcpnotify_user test_sysctl \ @@ -46,7 +45,8 @@ endif TEST_GEN_FILES = TEST_FILES = test_lwt_ip_encap.o \ - test_tc_edt.o + test_tc_edt.o \ + xsk_prereqs.sh # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ @@ -70,17 +70,17 @@ TEST_PROGS := test_kmod.sh \ test_bpftool_build.sh \ test_bpftool.sh \ test_bpftool_metadata.sh \ + test_xsk.sh TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ - tcp_client.py \ - tcp_server.py \ test_xdp_vlan.sh # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ - test_lirc_mode2_user xdping test_cpp runqslower bench + test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ + xdpxceiver TEST_CUSTOM_PROGS = urandom_read @@ -104,6 +104,7 @@ OVERRIDE_TARGETS := 1 override define CLEAN $(call msg,CLEAN) $(Q)$(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) + $(Q)$(MAKE) -C bpf_testmod clean endef include ../lib.mk @@ -114,6 +115,13 @@ INCLUDE_DIR := $(SCRATCH_DIR)/include BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a RESOLVE_BTFIDS := $(BUILD_DIR)/resolve_btfids/resolve_btfids +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + # Define simple and short `make test_progs`, `make test_sysctl`, etc targets # to build individual tests. # NOTE: Semicolon at the end is critical to override lib.mk's default static @@ -136,17 +144,16 @@ $(OUTPUT)/urandom_read: urandom_read.c $(call msg,BINARY,,$@) $(Q)$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS) -Wl,--build-id=sha1 +$(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch]) + $(call msg,MOD,,$@) + $(Q)$(RM) bpf_testmod/bpf_testmod.ko # force re-compilation + $(Q)$(MAKE) $(submake_extras) -C bpf_testmod + $(Q)cp bpf_testmod/bpf_testmod.ko $@ + $(OUTPUT)/test_stub.o: test_stub.c $(BPFOBJ) $(call msg,CC,,$@) $(Q)$(CC) -c $(CFLAGS) -o $@ $< -VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ - $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ - ../../../../vmlinux \ - /sys/kernel/btf/vmlinux \ - /boot/vmlinux-$(shell uname -r) -VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) - DEFAULT_BPFTOOL := $(SCRATCH_DIR)/sbin/bpftool $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) @@ -163,7 +170,6 @@ $(OUTPUT)/test_sock: cgroup_helpers.c $(OUTPUT)/test_sock_addr: cgroup_helpers.c $(OUTPUT)/test_socket_cookie: cgroup_helpers.c $(OUTPUT)/test_sockmap: cgroup_helpers.c -$(OUTPUT)/test_tcpbpf_user: cgroup_helpers.c $(OUTPUT)/test_tcpnotify_user: cgroup_helpers.c trace_helpers.c $(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c $(OUTPUT)/test_cgroup_storage: cgroup_helpers.c @@ -220,7 +226,8 @@ $(RESOLVE_BTFIDS): $(BPFOBJ) | $(BUILD_DIR)/resolve_btfids \ # build would have failed anyways. define get_sys_includes $(shell $(1) -v -E - </dev/null 2>&1 \ - | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - </dev/null | grep '#define __riscv_xlen ' | sed 's/#define /-D/' | sed 's/ /=/') endef # Determine target endianness. @@ -245,31 +252,19 @@ $(OUTPUT)/flow_dissector_load.o: flow_dissector_load.h # $1 - input .c file # $2 - output .o file # $3 - CFLAGS -# $4 - LDFLAGS define CLANG_BPF_BUILD_RULE - $(call msg,CLNG-LLC,$(TRUNNER_BINARY),$2) - $(Q)($(CLANG) $3 -O2 -target bpf -emit-llvm \ - -c $1 -o - || echo "BPF obj compilation failed") | \ - $(LLC) -mattr=dwarfris -march=bpf -mcpu=v3 $4 -filetype=obj -o $2 + $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) + $(Q)$(CLANG) $3 -O2 -target bpf -c $1 -o $2 -mcpu=v3 endef # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32 define CLANG_NOALU32_BPF_BUILD_RULE - $(call msg,CLNG-LLC,$(TRUNNER_BINARY),$2) - $(Q)($(CLANG) $3 -O2 -target bpf -emit-llvm \ - -c $1 -o - || echo "BPF obj compilation failed") | \ - $(LLC) -march=bpf -mcpu=v2 $4 -filetype=obj -o $2 -endef -# Similar to CLANG_BPF_BUILD_RULE, but using native Clang and bpf LLC -define CLANG_NATIVE_BPF_BUILD_RULE $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) - $(Q)($(CLANG) $3 -O2 -emit-llvm \ - -c $1 -o - || echo "BPF obj compilation failed") | \ - $(LLC) -march=bpf -mcpu=v3 $4 -filetype=obj -o $2 + $(Q)$(CLANG) $3 -O2 -target bpf -c $1 -o $2 -mcpu=v2 endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE $(call msg,GCC-BPF,$(TRUNNER_BINARY),$2) - $(Q)$(BPF_GCC) $3 $4 -O2 -c $1 -o $2 + $(Q)$(BPF_GCC) $3 -O2 -c $1 -o $2 endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c @@ -324,8 +319,7 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(INCLUDE_DIR)/vmlinux.h \ $(wildcard $(BPFDIR)/bpf_*.h) | $(TRUNNER_OUTPUT) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ - $(TRUNNER_BPF_CFLAGS), \ - $(TRUNNER_BPF_LDFLAGS)) + $(TRUNNER_BPF_CFLAGS)) $(TRUNNER_BPF_SKELS): $(TRUNNER_OUTPUT)/%.skel.h: \ $(TRUNNER_OUTPUT)/%.o \ @@ -378,7 +372,7 @@ $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ | $(TRUNNER_BINARY)-extras $$(call msg,BINARY,,$$@) $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@ - $(Q)$(RESOLVE_BTFIDS) --no-fail --btf btf_data.o $$@ + $(Q)$(RESOLVE_BTFIDS) --no-fail --btf $(TRUNNER_OUTPUT)/btf_data.o $$@ endef @@ -387,24 +381,22 @@ TRUNNER_TESTS_DIR := prog_tests TRUNNER_BPF_PROGS_DIR := progs TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ network_helpers.c testing_helpers.c \ - flow_dissector_load.h -TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ + btf_helpers.c flow_dissector_load.h +TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ + ima_setup.sh \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) -TRUNNER_BPF_LDFLAGS := -mattr=+alu32 $(eval $(call DEFINE_TEST_RUNNER,test_progs)) # Define test_progs-no_alu32 test runner. TRUNNER_BPF_BUILD_RULE := CLANG_NOALU32_BPF_BUILD_RULE -TRUNNER_BPF_LDFLAGS := $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) # Define test_progs BPF-GCC-flavored test runner. ifneq ($(BPF_GCC),) TRUNNER_BPF_BUILD_RULE := GCC_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(call get_sys_includes,gcc) -TRUNNER_BPF_LDFLAGS := $(eval $(call DEFINE_TEST_RUNNER,test_progs,bpf_gcc)) endif @@ -415,7 +407,6 @@ TRUNNER_EXTRA_SOURCES := test_maps.c TRUNNER_EXTRA_FILES := TRUNNER_BPF_BUILD_RULE := $$(error no BPF objects should be built) TRUNNER_BPF_CFLAGS := -TRUNNER_BPF_LDFLAGS := $(eval $(call DEFINE_TEST_RUNNER,test_maps)) # Define test_verifier test runner. @@ -459,4 +450,4 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature \ - $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc) + $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc bpf_testmod.ko) diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index ac9eda830187..ca064180d4d0 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -2,7 +2,10 @@ BPF Selftest Notes ================== General instructions on running selftests can be found in -`Documentation/bpf/bpf_devel_QA.rst`_. +`Documentation/bpf/bpf_devel_QA.rst`__. + +__ /Documentation/bpf/bpf_devel_QA.rst#q-how-to-run-bpf-selftests + Additional information about selftest failures are documented here. @@ -30,11 +33,12 @@ The verifier will reject such code with above error. At insn 18 the r7 is indeed unbounded. The later insn 19 checks the bounds and the insn 20 undoes map_value addition. It is currently impossible for the verifier to understand such speculative pointer arithmetic. -Hence - https://reviews.llvm.org/D85570 -addresses it on the compiler side. It was committed on llvm 12. +Hence `this patch`__ addresses it on the compiler side. It was committed on llvm 12. + +__ https://reviews.llvm.org/D85570 The corresponding C code + .. code-block:: c for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) { @@ -77,10 +81,11 @@ The symptom for ``bpf_iter/netlink`` looks like 17: (7b) *(u64 *)(r7 +0) = r2 only read is supported -This is due to a llvm BPF backend bug. The fix - https://reviews.llvm.org/D78466 +This is due to a llvm BPF backend bug. `The fix`__ has been pushed to llvm 10.x release branch and will be -available in 10.0.1. The fix is available in llvm 11.0.0 trunk. +available in 10.0.1. The patch is available in llvm 11.0.0 trunk. + +__ https://reviews.llvm.org/D78466 BPF CO-RE-based tests and Clang version ======================================= @@ -94,11 +99,11 @@ them to Clang/LLVM. These sub-tests are going to be skipped if Clang is too old to support them, they shouldn't cause build failures or runtime test failures: - - __builtin_btf_type_id() ([0], [1], [2]); - - __builtin_preserve_type_info(), __builtin_preserve_enum_value() ([3], [4]). +- __builtin_btf_type_id() [0_, 1_, 2_]; +- __builtin_preserve_type_info(), __builtin_preserve_enum_value() [3_, 4_]. - [0] https://reviews.llvm.org/D74572 - [1] https://reviews.llvm.org/D74668 - [2] https://reviews.llvm.org/D85174 - [3] https://reviews.llvm.org/D83878 - [4] https://reviews.llvm.org/D83242 +.. _0: https://reviews.llvm.org/D74572 +.. _1: https://reviews.llvm.org/D74668 +.. _2: https://reviews.llvm.org/D85174 +.. _3: https://reviews.llvm.org/D83878 +.. _4: https://reviews.llvm.org/D83242 diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h index 2915664c335d..6a9053162cf2 100644 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -56,6 +56,7 @@ struct tcp_sock { __u32 rcv_nxt; __u32 snd_nxt; __u32 snd_una; + __u32 window_clamp; __u8 ecn_flags; __u32 delivered; __u32 delivered_ce; diff --git a/tools/testing/selftests/bpf/bpf_testmod/.gitignore b/tools/testing/selftests/bpf/bpf_testmod/.gitignore new file mode 100644 index 000000000000..ded513777281 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_testmod/.gitignore @@ -0,0 +1,6 @@ +*.mod +*.mod.c +*.o +.ko +/Module.symvers +/modules.order diff --git a/tools/testing/selftests/bpf/bpf_testmod/Makefile b/tools/testing/selftests/bpf/bpf_testmod/Makefile new file mode 100644 index 000000000000..15cb36c4483a --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_testmod/Makefile @@ -0,0 +1,20 @@ +BPF_TESTMOD_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) +KDIR ?= $(abspath $(BPF_TESTMOD_DIR)/../../../../..) + +ifeq ($(V),1) +Q = +else +Q = @ +endif + +MODULES = bpf_testmod.ko + +obj-m += bpf_testmod.o +CFLAGS_bpf_testmod.o = -I$(src) + +all: + +$(Q)make -C $(KDIR) M=$(BPF_TESTMOD_DIR) modules + +clean: + +$(Q)make -C $(KDIR) M=$(BPF_TESTMOD_DIR) clean + diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h new file mode 100644 index 000000000000..b83ea448bc79 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bpf_testmod + +#if !defined(_BPF_TESTMOD_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _BPF_TESTMOD_EVENTS_H + +#include <linux/tracepoint.h> +#include "bpf_testmod.h" + +TRACE_EVENT(bpf_testmod_test_read, + TP_PROTO(struct task_struct *task, struct bpf_testmod_test_read_ctx *ctx), + TP_ARGS(task, ctx), + TP_STRUCT__entry( + __field(pid_t, pid) + __array(char, comm, TASK_COMM_LEN) + __field(loff_t, off) + __field(size_t, len) + ), + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->off = ctx->off; + __entry->len = ctx->len; + ), + TP_printk("pid=%d comm=%s off=%llu len=%zu", + __entry->pid, __entry->comm, __entry->off, __entry->len) +); + +#endif /* _BPF_TESTMOD_EVENTS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE bpf_testmod-events +#include <trace/define_trace.h> diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c new file mode 100644 index 000000000000..2df19d73ca49 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <linux/error-injection.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/sysfs.h> +#include <linux/tracepoint.h> +#include "bpf_testmod.h" + +#define CREATE_TRACE_POINTS +#include "bpf_testmod-events.h" + +noinline ssize_t +bpf_testmod_test_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + struct bpf_testmod_test_read_ctx ctx = { + .buf = buf, + .off = off, + .len = len, + }; + + trace_bpf_testmod_test_read(current, &ctx); + + return -EIO; /* always fail */ +} +EXPORT_SYMBOL(bpf_testmod_test_read); +ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO); + +static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { + .attr = { .name = "bpf_testmod", .mode = 0444, }, + .read = bpf_testmod_test_read, +}; + +static int bpf_testmod_init(void) +{ + return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); +} + +static void bpf_testmod_exit(void) +{ + return sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); +} + +module_init(bpf_testmod_init); +module_exit(bpf_testmod_exit); + +MODULE_AUTHOR("Andrii Nakryiko"); +MODULE_DESCRIPTION("BPF selftests module"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h new file mode 100644 index 000000000000..b81adfedb4f6 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +#ifndef _BPF_TESTMOD_H +#define _BPF_TESTMOD_H + +#include <linux/types.h> + +struct bpf_testmod_test_read_ctx { + char *buf; + loff_t off; + size_t len; +}; + +#endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/btf_helpers.c b/tools/testing/selftests/bpf/btf_helpers.c new file mode 100644 index 000000000000..48f90490f922 --- /dev/null +++ b/tools/testing/selftests/bpf/btf_helpers.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <stdio.h> +#include <errno.h> +#include <bpf/btf.h> +#include <bpf/libbpf.h> +#include "test_progs.h" + +static const char * const btf_kind_str_mapping[] = { + [BTF_KIND_UNKN] = "UNKNOWN", + [BTF_KIND_INT] = "INT", + [BTF_KIND_PTR] = "PTR", + [BTF_KIND_ARRAY] = "ARRAY", + [BTF_KIND_STRUCT] = "STRUCT", + [BTF_KIND_UNION] = "UNION", + [BTF_KIND_ENUM] = "ENUM", + [BTF_KIND_FWD] = "FWD", + [BTF_KIND_TYPEDEF] = "TYPEDEF", + [BTF_KIND_VOLATILE] = "VOLATILE", + [BTF_KIND_CONST] = "CONST", + [BTF_KIND_RESTRICT] = "RESTRICT", + [BTF_KIND_FUNC] = "FUNC", + [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", + [BTF_KIND_VAR] = "VAR", + [BTF_KIND_DATASEC] = "DATASEC", +}; + +static const char *btf_kind_str(__u16 kind) +{ + if (kind > BTF_KIND_DATASEC) + return "UNKNOWN"; + return btf_kind_str_mapping[kind]; +} + +static const char *btf_int_enc_str(__u8 encoding) +{ + switch (encoding) { + case 0: + return "(none)"; + case BTF_INT_SIGNED: + return "SIGNED"; + case BTF_INT_CHAR: + return "CHAR"; + case BTF_INT_BOOL: + return "BOOL"; + default: + return "UNKN"; + } +} + +static const char *btf_var_linkage_str(__u32 linkage) +{ + switch (linkage) { + case BTF_VAR_STATIC: + return "static"; + case BTF_VAR_GLOBAL_ALLOCATED: + return "global-alloc"; + default: + return "(unknown)"; + } +} + +static const char *btf_func_linkage_str(const struct btf_type *t) +{ + switch (btf_vlen(t)) { + case BTF_FUNC_STATIC: + return "static"; + case BTF_FUNC_GLOBAL: + return "global"; + case BTF_FUNC_EXTERN: + return "extern"; + default: + return "(unknown)"; + } +} + +static const char *btf_str(const struct btf *btf, __u32 off) +{ + if (!off) + return "(anon)"; + return btf__str_by_offset(btf, off) ?: "(invalid)"; +} + +int fprintf_btf_type_raw(FILE *out, const struct btf *btf, __u32 id) +{ + const struct btf_type *t; + int kind, i; + __u32 vlen; + + t = btf__type_by_id(btf, id); + if (!t) + return -EINVAL; + + vlen = btf_vlen(t); + kind = btf_kind(t); + + fprintf(out, "[%u] %s '%s'", id, btf_kind_str(kind), btf_str(btf, t->name_off)); + + switch (kind) { + case BTF_KIND_INT: + fprintf(out, " size=%u bits_offset=%u nr_bits=%u encoding=%s", + t->size, btf_int_offset(t), btf_int_bits(t), + btf_int_enc_str(btf_int_encoding(t))); + break; + case BTF_KIND_PTR: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPEDEF: + fprintf(out, " type_id=%u", t->type); + break; + case BTF_KIND_ARRAY: { + const struct btf_array *arr = btf_array(t); + + fprintf(out, " type_id=%u index_type_id=%u nr_elems=%u", + arr->type, arr->index_type, arr->nelems); + break; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + + fprintf(out, " size=%u vlen=%u", t->size, vlen); + for (i = 0; i < vlen; i++, m++) { + __u32 bit_off, bit_sz; + + bit_off = btf_member_bit_offset(t, i); + bit_sz = btf_member_bitfield_size(t, i); + fprintf(out, "\n\t'%s' type_id=%u bits_offset=%u", + btf_str(btf, m->name_off), m->type, bit_off); + if (bit_sz) + fprintf(out, " bitfield_size=%u", bit_sz); + } + break; + } + case BTF_KIND_ENUM: { + const struct btf_enum *v = btf_enum(t); + + fprintf(out, " size=%u vlen=%u", t->size, vlen); + for (i = 0; i < vlen; i++, v++) { + fprintf(out, "\n\t'%s' val=%u", + btf_str(btf, v->name_off), v->val); + } + break; + } + case BTF_KIND_FWD: + fprintf(out, " fwd_kind=%s", btf_kflag(t) ? "union" : "struct"); + break; + case BTF_KIND_FUNC: + fprintf(out, " type_id=%u linkage=%s", t->type, btf_func_linkage_str(t)); + break; + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + + fprintf(out, " ret_type_id=%u vlen=%u", t->type, vlen); + for (i = 0; i < vlen; i++, p++) { + fprintf(out, "\n\t'%s' type_id=%u", + btf_str(btf, p->name_off), p->type); + } + break; + } + case BTF_KIND_VAR: + fprintf(out, " type_id=%u, linkage=%s", + t->type, btf_var_linkage_str(btf_var(t)->linkage)); + break; + case BTF_KIND_DATASEC: { + const struct btf_var_secinfo *v = btf_var_secinfos(t); + + fprintf(out, " size=%u vlen=%u", t->size, vlen); + for (i = 0; i < vlen; i++, v++) { + fprintf(out, "\n\ttype_id=%u offset=%u size=%u", + v->type, v->offset, v->size); + } + break; + } + default: + break; + } + + return 0; +} + +/* Print raw BTF type dump into a local buffer and return string pointer back. + * Buffer *will* be overwritten by subsequent btf_type_raw_dump() calls + */ +const char *btf_type_raw_dump(const struct btf *btf, int type_id) +{ + static char buf[16 * 1024]; + FILE *buf_file; + + buf_file = fmemopen(buf, sizeof(buf) - 1, "w"); + if (!buf_file) { + fprintf(stderr, "Failed to open memstream: %d\n", errno); + return NULL; + } + + fprintf_btf_type_raw(buf_file, btf, type_id); + fflush(buf_file); + fclose(buf_file); + + return buf; +} + +int btf_validate_raw(struct btf *btf, int nr_types, const char *exp_types[]) +{ + int i; + bool ok = true; + + ASSERT_EQ(btf__get_nr_types(btf), nr_types, "btf_nr_types"); + + for (i = 1; i <= nr_types; i++) { + if (!ASSERT_STREQ(btf_type_raw_dump(btf, i), exp_types[i - 1], "raw_dump")) + ok = false; + } + + return ok; +} + +static void btf_dump_printf(void *ctx, const char *fmt, va_list args) +{ + vfprintf(ctx, fmt, args); +} + +/* Print BTF-to-C dump into a local buffer and return string pointer back. + * Buffer *will* be overwritten by subsequent btf_type_raw_dump() calls + */ +const char *btf_type_c_dump(const struct btf *btf) +{ + static char buf[16 * 1024]; + FILE *buf_file; + struct btf_dump *d = NULL; + struct btf_dump_opts opts = {}; + int err, i; + + buf_file = fmemopen(buf, sizeof(buf) - 1, "w"); + if (!buf_file) { + fprintf(stderr, "Failed to open memstream: %d\n", errno); + return NULL; + } + + opts.ctx = buf_file; + d = btf_dump__new(btf, NULL, &opts, btf_dump_printf); + if (libbpf_get_error(d)) { + fprintf(stderr, "Failed to create btf_dump instance: %ld\n", libbpf_get_error(d)); + return NULL; + } + + for (i = 1; i <= btf__get_nr_types(btf); i++) { + err = btf_dump__dump_type(d, i); + if (err) { + fprintf(stderr, "Failed to dump type [%d]: %d\n", i, err); + return NULL; + } + } + + fflush(buf_file); + fclose(buf_file); + return buf; +} diff --git a/tools/testing/selftests/bpf/btf_helpers.h b/tools/testing/selftests/bpf/btf_helpers.h new file mode 100644 index 000000000000..295c0137d9bd --- /dev/null +++ b/tools/testing/selftests/bpf/btf_helpers.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +#ifndef __BTF_HELPERS_H +#define __BTF_HELPERS_H + +#include <stdio.h> +#include <bpf/btf.h> + +int fprintf_btf_type_raw(FILE *out, const struct btf *btf, __u32 id); +const char *btf_type_raw_dump(const struct btf *btf, int type_id); +int btf_validate_raw(struct btf *btf, int nr_types, const char *exp_types[]); + +#define VALIDATE_RAW_BTF(btf, raw_types...) \ + btf_validate_raw(btf, \ + sizeof((const char *[]){raw_types})/sizeof(void *),\ + (const char *[]){raw_types}) + +const char *btf_type_c_dump(const struct btf *btf); +#endif diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 2118e23ac07a..37e1f303fc11 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -39,3 +39,8 @@ CONFIG_BPF_JIT=y CONFIG_BPF_LSM=y CONFIG_SECURITY=y CONFIG_LIRC=y +CONFIG_IMA=y +CONFIG_SECURITYFS=y +CONFIG_IMA_WRITE_POLICY=y +CONFIG_IMA_READ_POLICY=y +CONFIG_BLK_DEV_LOOP=y diff --git a/tools/testing/selftests/bpf/ima_setup.sh b/tools/testing/selftests/bpf/ima_setup.sh new file mode 100755 index 000000000000..8e62581113a3 --- /dev/null +++ b/tools/testing/selftests/bpf/ima_setup.sh @@ -0,0 +1,123 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +set -e +set -u +set -o pipefail + +IMA_POLICY_FILE="/sys/kernel/security/ima/policy" +TEST_BINARY="/bin/true" +VERBOSE="${SELFTESTS_VERBOSE:=0}" +LOG_FILE="$(mktemp /tmp/ima_setup.XXXX.log)" + +usage() +{ + echo "Usage: $0 <setup|cleanup|run> <existing_tmp_dir>" + exit 1 +} + +ensure_mount_securityfs() +{ + local securityfs_dir=$(grep "securityfs" /proc/mounts | awk '{print $2}') + + if [ -z "${securityfs_dir}" ]; then + securityfs_dir=/sys/kernel/security + mount -t securityfs security "${securityfs_dir}" + fi + + if [ ! -d "${securityfs_dir}" ]; then + echo "${securityfs_dir}: securityfs is not mounted" && exit 1 + fi +} + +setup() +{ + local tmp_dir="$1" + local mount_img="${tmp_dir}/test.img" + local mount_dir="${tmp_dir}/mnt" + local copied_bin_path="${mount_dir}/$(basename ${TEST_BINARY})" + mkdir -p ${mount_dir} + + dd if=/dev/zero of="${mount_img}" bs=1M count=10 + + losetup -f "${mount_img}" + local loop_device=$(losetup -a | grep ${mount_img:?} | cut -d ":" -f1) + + mkfs.ext2 "${loop_device:?}" + mount "${loop_device}" "${mount_dir}" + + cp "${TEST_BINARY}" "${mount_dir}" + local mount_uuid="$(blkid ${loop_device} | sed 's/.*UUID="\([^"]*\)".*/\1/')" + + ensure_mount_securityfs + echo "measure func=BPRM_CHECK fsuuid=${mount_uuid}" > ${IMA_POLICY_FILE} +} + +cleanup() { + local tmp_dir="$1" + local mount_img="${tmp_dir}/test.img" + local mount_dir="${tmp_dir}/mnt" + + local loop_devices=$(losetup -a | grep ${mount_img:?} | cut -d ":" -f1) + + for loop_dev in "${loop_devices}"; do + losetup -d $loop_dev + done + + umount ${mount_dir} + rm -rf ${tmp_dir} +} + +run() +{ + local tmp_dir="$1" + local mount_dir="${tmp_dir}/mnt" + local copied_bin_path="${mount_dir}/$(basename ${TEST_BINARY})" + + exec "${copied_bin_path}" +} + +catch() +{ + local exit_code="$1" + local log_file="$2" + + if [[ "${exit_code}" -ne 0 ]]; then + cat "${log_file}" >&3 + fi + + rm -f "${log_file}" + exit ${exit_code} +} + +main() +{ + [[ $# -ne 2 ]] && usage + + local action="$1" + local tmp_dir="$2" + + [[ ! -d "${tmp_dir}" ]] && echo "Directory ${tmp_dir} doesn't exist" && exit 1 + + if [[ "${action}" == "setup" ]]; then + setup "${tmp_dir}" + elif [[ "${action}" == "cleanup" ]]; then + cleanup "${tmp_dir}" + elif [[ "${action}" == "run" ]]; then + run "${tmp_dir}" + else + echo "Unknown action: ${action}" + exit 1 + fi +} + +trap 'catch "$?" "${LOG_FILE}"' EXIT + +if [[ "${VERBOSE}" -eq 0 ]]; then + # Save the stderr to 3 so that we can output back to + # it incase of an error. + exec 3>&2 1>"${LOG_FILE}" 2>&1 +fi + +main "$@" +rm -f "${LOG_FILE}" diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 52414058a627..5861446d0777 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -456,10 +456,10 @@ static struct bpf_align_test tests[] = { */ {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ - {9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, - {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, + {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -467,7 +467,7 @@ static struct bpf_align_test tests[] = { * So we did not get a 'range' on R6, and the access * attempt will fail. */ - {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, } }, { diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 448885b95eed..0e586368948d 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -20,6 +20,7 @@ #include "bpf_iter_bpf_percpu_hash_map.skel.h" #include "bpf_iter_bpf_array_map.skel.h" #include "bpf_iter_bpf_percpu_array_map.skel.h" +#include "bpf_iter_bpf_sk_storage_helpers.skel.h" #include "bpf_iter_bpf_sk_storage_map.skel.h" #include "bpf_iter_test_kern5.skel.h" #include "bpf_iter_test_kern6.skel.h" @@ -913,6 +914,119 @@ out: bpf_iter_bpf_percpu_array_map__destroy(skel); } +/* An iterator program deletes all local storage in a map. */ +static void test_bpf_sk_storage_delete(void) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + struct bpf_iter_bpf_sk_storage_helpers *skel; + union bpf_iter_link_info linfo; + int err, len, map_fd, iter_fd; + struct bpf_link *link; + int sock_fd = -1; + __u32 val = 42; + char buf[64]; + + skel = bpf_iter_bpf_sk_storage_helpers__open_and_load(); + if (CHECK(!skel, "bpf_iter_bpf_sk_storage_helpers__open_and_load", + "skeleton open_and_load failed\n")) + return; + + map_fd = bpf_map__fd(skel->maps.sk_stg_map); + + sock_fd = socket(AF_INET6, SOCK_STREAM, 0); + if (CHECK(sock_fd < 0, "socket", "errno: %d\n", errno)) + goto out; + err = bpf_map_update_elem(map_fd, &sock_fd, &val, BPF_NOEXIST); + if (CHECK(err, "map_update", "map_update failed\n")) + goto out; + + memset(&linfo, 0, sizeof(linfo)); + linfo.map.map_fd = map_fd; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + link = bpf_program__attach_iter(skel->progs.delete_bpf_sk_storage_map, + &opts); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto out; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + /* do some tests */ + while ((len = read(iter_fd, buf, sizeof(buf))) > 0) + ; + if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + goto close_iter; + + /* test results */ + err = bpf_map_lookup_elem(map_fd, &sock_fd, &val); + if (CHECK(!err || errno != ENOENT, "bpf_map_lookup_elem", + "map value wasn't deleted (err=%d, errno=%d)\n", err, errno)) + goto close_iter; + +close_iter: + close(iter_fd); +free_link: + bpf_link__destroy(link); +out: + if (sock_fd >= 0) + close(sock_fd); + bpf_iter_bpf_sk_storage_helpers__destroy(skel); +} + +/* This creates a socket and its local storage. It then runs a task_iter BPF + * program that replaces the existing socket local storage with the tgid of the + * only task owning a file descriptor to this socket, this process, prog_tests. + * It then runs a tcp socket iterator that negates the value in the existing + * socket local storage, the test verifies that the resulting value is -pid. + */ +static void test_bpf_sk_storage_get(void) +{ + struct bpf_iter_bpf_sk_storage_helpers *skel; + int err, map_fd, val = -1; + int sock_fd = -1; + + skel = bpf_iter_bpf_sk_storage_helpers__open_and_load(); + if (CHECK(!skel, "bpf_iter_bpf_sk_storage_helpers__open_and_load", + "skeleton open_and_load failed\n")) + return; + + sock_fd = socket(AF_INET6, SOCK_STREAM, 0); + if (CHECK(sock_fd < 0, "socket", "errno: %d\n", errno)) + goto out; + + err = listen(sock_fd, 1); + if (CHECK(err != 0, "listen", "errno: %d\n", errno)) + goto close_socket; + + map_fd = bpf_map__fd(skel->maps.sk_stg_map); + + err = bpf_map_update_elem(map_fd, &sock_fd, &val, BPF_NOEXIST); + if (CHECK(err, "bpf_map_update_elem", "map_update_failed\n")) + goto close_socket; + + do_dummy_read(skel->progs.fill_socket_owner); + + err = bpf_map_lookup_elem(map_fd, &sock_fd, &val); + if (CHECK(err || val != getpid(), "bpf_map_lookup_elem", + "map value wasn't set correctly (expected %d, got %d, err=%d)\n", + getpid(), val, err)) + goto close_socket; + + do_dummy_read(skel->progs.negate_socket_local_storage); + + err = bpf_map_lookup_elem(map_fd, &sock_fd, &val); + CHECK(err || val != -getpid(), "bpf_map_lookup_elem", + "map value wasn't set correctly (expected %d, got %d, err=%d)\n", + -getpid(), val, err); + +close_socket: + close(sock_fd); +out: + bpf_iter_bpf_sk_storage_helpers__destroy(skel); +} + static void test_bpf_sk_storage_map(void) { DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); @@ -1067,6 +1181,10 @@ void test_bpf_iter(void) test_bpf_percpu_array_map(); if (test__start_subtest("bpf_sk_storage_map")) test_bpf_sk_storage_map(); + if (test__start_subtest("bpf_sk_storage_delete")) + test_bpf_sk_storage_delete(); + if (test__start_subtest("bpf_sk_storage_get")) + test_bpf_sk_storage_get(); if (test__start_subtest("rdonly-buf-out-of-bound")) test_rdonly_buf_out_of_bound(); if (test__start_subtest("buf-neg-offset")) diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 93162484c2ca..8ae97e2a4b9d 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -6652,7 +6652,7 @@ static void do_test_dedup(unsigned int test_num) const void *test_btf_data, *expect_btf_data; const char *ret_test_next_str, *ret_expect_next_str; const char *test_strs, *expect_strs; - const char *test_str_cur, *test_str_end; + const char *test_str_cur; const char *expect_str_cur, *expect_str_end; unsigned int raw_btf_size; void *raw_btf; @@ -6719,12 +6719,18 @@ static void do_test_dedup(unsigned int test_num) goto done; } - test_str_cur = test_strs; - test_str_end = test_strs + test_hdr->str_len; expect_str_cur = expect_strs; expect_str_end = expect_strs + expect_hdr->str_len; - while (test_str_cur < test_str_end && expect_str_cur < expect_str_end) { + while (expect_str_cur < expect_str_end) { size_t test_len, expect_len; + int off; + + off = btf__find_str(test_btf, expect_str_cur); + if (CHECK(off < 0, "exp str '%s' not found: %d\n", expect_str_cur, off)) { + err = -1; + goto done; + } + test_str_cur = btf__str_by_offset(test_btf, off); test_len = strlen(test_str_cur); expect_len = strlen(expect_str_cur); @@ -6741,15 +6747,8 @@ static void do_test_dedup(unsigned int test_num) err = -1; goto done; } - test_str_cur += test_len + 1; expect_str_cur += expect_len + 1; } - if (CHECK(test_str_cur != test_str_end, - "test_str_cur:%p != test_str_end:%p", - test_str_cur, test_str_end)) { - err = -1; - goto done; - } test_nr_types = btf__get_nr_types(test_btf); expect_nr_types = btf__get_nr_types(expect_btf); @@ -6775,10 +6774,21 @@ static void do_test_dedup(unsigned int test_num) err = -1; goto done; } - if (CHECK(memcmp((void *)test_type, - (void *)expect_type, - test_size), - "type #%d: contents differ", i)) { + if (CHECK(btf_kind(test_type) != btf_kind(expect_type), + "type %d kind: exp %d != got %u\n", + i, btf_kind(expect_type), btf_kind(test_type))) { + err = -1; + goto done; + } + if (CHECK(test_type->info != expect_type->info, + "type %d info: exp %d != got %u\n", + i, expect_type->info, test_type->info)) { + err = -1; + goto done; + } + if (CHECK(test_type->size != expect_type->size, + "type %d size/type: exp %d != got %u\n", + i, expect_type->size, test_type->size)) { err = -1; goto done; } diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c new file mode 100644 index 000000000000..64554fd33547 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <test_progs.h> +#include <bpf/btf.h> +#include "btf_helpers.h" + +static void test_split_simple() { + const struct btf_type *t; + struct btf *btf1, *btf2; + int str_off, err; + + btf1 = btf__new_empty(); + if (!ASSERT_OK_PTR(btf1, "empty_main_btf")) + return; + + btf__set_pointer_size(btf1, 8); /* enforce 64-bit arch */ + + btf__add_int(btf1, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(btf1, 1); /* [2] ptr to int */ + btf__add_struct(btf1, "s1", 4); /* [3] struct s1 { */ + btf__add_field(btf1, "f1", 1, 0, 0); /* int f1; */ + /* } */ + + VALIDATE_RAW_BTF( + btf1, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0"); + + ASSERT_STREQ(btf_type_c_dump(btf1), "\ +struct s1 {\n\ + int f1;\n\ +};\n\n", "c_dump"); + + btf2 = btf__new_empty_split(btf1); + if (!ASSERT_OK_PTR(btf2, "empty_split_btf")) + goto cleanup; + + /* pointer size should be "inherited" from main BTF */ + ASSERT_EQ(btf__pointer_size(btf2), 8, "inherit_ptr_sz"); + + str_off = btf__find_str(btf2, "int"); + ASSERT_NEQ(str_off, -ENOENT, "str_int_missing"); + + t = btf__type_by_id(btf2, 1); + if (!ASSERT_OK_PTR(t, "int_type")) + goto cleanup; + ASSERT_EQ(btf_is_int(t), true, "int_kind"); + ASSERT_STREQ(btf__str_by_offset(btf2, t->name_off), "int", "int_name"); + + btf__add_struct(btf2, "s2", 16); /* [4] struct s2 { */ + btf__add_field(btf2, "f1", 6, 0, 0); /* struct s1 f1; */ + btf__add_field(btf2, "f2", 5, 32, 0); /* int f2; */ + btf__add_field(btf2, "f3", 2, 64, 0); /* int *f3; */ + /* } */ + + /* duplicated int */ + btf__add_int(btf2, "int", 4, BTF_INT_SIGNED); /* [5] int */ + + /* duplicated struct s1 */ + btf__add_struct(btf2, "s1", 4); /* [6] struct s1 { */ + btf__add_field(btf2, "f1", 5, 0, 0); /* int f1; */ + /* } */ + + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0", + "[4] STRUCT 's2' size=16 vlen=3\n" + "\t'f1' type_id=6 bits_offset=0\n" + "\t'f2' type_id=5 bits_offset=32\n" + "\t'f3' type_id=2 bits_offset=64", + "[5] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[6] STRUCT 's1' size=4 vlen=1\n" + "\t'f1' type_id=5 bits_offset=0"); + + ASSERT_STREQ(btf_type_c_dump(btf2), "\ +struct s1 {\n\ + int f1;\n\ +};\n\ +\n\ +struct s1___2 {\n\ + int f1;\n\ +};\n\ +\n\ +struct s2 {\n\ + struct s1___2 f1;\n\ + int f2;\n\ + int *f3;\n\ +};\n\n", "c_dump"); + + err = btf__dedup(btf2, NULL, NULL); + if (!ASSERT_OK(err, "btf_dedup")) + goto cleanup; + + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0", + "[4] STRUCT 's2' size=16 vlen=3\n" + "\t'f1' type_id=3 bits_offset=0\n" + "\t'f2' type_id=1 bits_offset=32\n" + "\t'f3' type_id=2 bits_offset=64"); + + ASSERT_STREQ(btf_type_c_dump(btf2), "\ +struct s1 {\n\ + int f1;\n\ +};\n\ +\n\ +struct s2 {\n\ + struct s1 f1;\n\ + int f2;\n\ + int *f3;\n\ +};\n\n", "c_dump"); + +cleanup: + btf__free(btf2); + btf__free(btf1); +} + +static void test_split_fwd_resolve() { + struct btf *btf1, *btf2; + int err; + + btf1 = btf__new_empty(); + if (!ASSERT_OK_PTR(btf1, "empty_main_btf")) + return; + + btf__set_pointer_size(btf1, 8); /* enforce 64-bit arch */ + + btf__add_int(btf1, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(btf1, 4); /* [2] ptr to struct s1 */ + btf__add_ptr(btf1, 5); /* [3] ptr to struct s2 */ + btf__add_struct(btf1, "s1", 16); /* [4] struct s1 { */ + btf__add_field(btf1, "f1", 2, 0, 0); /* struct s1 *f1; */ + btf__add_field(btf1, "f2", 3, 64, 0); /* struct s2 *f2; */ + /* } */ + btf__add_struct(btf1, "s2", 4); /* [5] struct s2 { */ + btf__add_field(btf1, "f1", 1, 0, 0); /* int f1; */ + /* } */ + + VALIDATE_RAW_BTF( + btf1, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=4", + "[3] PTR '(anon)' type_id=5", + "[4] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=2 bits_offset=0\n" + "\t'f2' type_id=3 bits_offset=64", + "[5] STRUCT 's2' size=4 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0"); + + btf2 = btf__new_empty_split(btf1); + if (!ASSERT_OK_PTR(btf2, "empty_split_btf")) + goto cleanup; + + btf__add_int(btf2, "int", 4, BTF_INT_SIGNED); /* [6] int */ + btf__add_ptr(btf2, 10); /* [7] ptr to struct s1 */ + btf__add_fwd(btf2, "s2", BTF_FWD_STRUCT); /* [8] fwd for struct s2 */ + btf__add_ptr(btf2, 8); /* [9] ptr to fwd struct s2 */ + btf__add_struct(btf2, "s1", 16); /* [10] struct s1 { */ + btf__add_field(btf2, "f1", 7, 0, 0); /* struct s1 *f1; */ + btf__add_field(btf2, "f2", 9, 64, 0); /* struct s2 *f2; */ + /* } */ + + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=4", + "[3] PTR '(anon)' type_id=5", + "[4] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=2 bits_offset=0\n" + "\t'f2' type_id=3 bits_offset=64", + "[5] STRUCT 's2' size=4 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0", + "[6] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[7] PTR '(anon)' type_id=10", + "[8] FWD 's2' fwd_kind=struct", + "[9] PTR '(anon)' type_id=8", + "[10] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=7 bits_offset=0\n" + "\t'f2' type_id=9 bits_offset=64"); + + err = btf__dedup(btf2, NULL, NULL); + if (!ASSERT_OK(err, "btf_dedup")) + goto cleanup; + + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=4", + "[3] PTR '(anon)' type_id=5", + "[4] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=2 bits_offset=0\n" + "\t'f2' type_id=3 bits_offset=64", + "[5] STRUCT 's2' size=4 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0"); + +cleanup: + btf__free(btf2); + btf__free(btf1); +} + +static void test_split_struct_duped() { + struct btf *btf1, *btf2; + int err; + + btf1 = btf__new_empty(); + if (!ASSERT_OK_PTR(btf1, "empty_main_btf")) + return; + + btf__set_pointer_size(btf1, 8); /* enforce 64-bit arch */ + + btf__add_int(btf1, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(btf1, 5); /* [2] ptr to struct s1 */ + btf__add_fwd(btf1, "s2", BTF_FWD_STRUCT); /* [3] fwd for struct s2 */ + btf__add_ptr(btf1, 3); /* [4] ptr to fwd struct s2 */ + btf__add_struct(btf1, "s1", 16); /* [5] struct s1 { */ + btf__add_field(btf1, "f1", 2, 0, 0); /* struct s1 *f1; */ + btf__add_field(btf1, "f2", 4, 64, 0); /* struct s2 *f2; */ + /* } */ + + VALIDATE_RAW_BTF( + btf1, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=5", + "[3] FWD 's2' fwd_kind=struct", + "[4] PTR '(anon)' type_id=3", + "[5] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=2 bits_offset=0\n" + "\t'f2' type_id=4 bits_offset=64"); + + btf2 = btf__new_empty_split(btf1); + if (!ASSERT_OK_PTR(btf2, "empty_split_btf")) + goto cleanup; + + btf__add_int(btf2, "int", 4, BTF_INT_SIGNED); /* [6] int */ + btf__add_ptr(btf2, 10); /* [7] ptr to struct s1 */ + btf__add_fwd(btf2, "s2", BTF_FWD_STRUCT); /* [8] fwd for struct s2 */ + btf__add_ptr(btf2, 11); /* [9] ptr to struct s2 */ + btf__add_struct(btf2, "s1", 16); /* [10] struct s1 { */ + btf__add_field(btf2, "f1", 7, 0, 0); /* struct s1 *f1; */ + btf__add_field(btf2, "f2", 9, 64, 0); /* struct s2 *f2; */ + /* } */ + btf__add_struct(btf2, "s2", 40); /* [11] struct s2 { */ + btf__add_field(btf2, "f1", 7, 0, 0); /* struct s1 *f1; */ + btf__add_field(btf2, "f2", 9, 64, 0); /* struct s2 *f2; */ + btf__add_field(btf2, "f3", 6, 128, 0); /* int f3; */ + btf__add_field(btf2, "f4", 10, 192, 0); /* struct s1 f4; */ + /* } */ + btf__add_ptr(btf2, 8); /* [12] ptr to fwd struct s2 */ + btf__add_struct(btf2, "s3", 8); /* [13] struct s3 { */ + btf__add_field(btf2, "f1", 12, 0, 0); /* struct s2 *f1; (fwd) */ + /* } */ + + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=5", + "[3] FWD 's2' fwd_kind=struct", + "[4] PTR '(anon)' type_id=3", + "[5] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=2 bits_offset=0\n" + "\t'f2' type_id=4 bits_offset=64", + "[6] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[7] PTR '(anon)' type_id=10", + "[8] FWD 's2' fwd_kind=struct", + "[9] PTR '(anon)' type_id=11", + "[10] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=7 bits_offset=0\n" + "\t'f2' type_id=9 bits_offset=64", + "[11] STRUCT 's2' size=40 vlen=4\n" + "\t'f1' type_id=7 bits_offset=0\n" + "\t'f2' type_id=9 bits_offset=64\n" + "\t'f3' type_id=6 bits_offset=128\n" + "\t'f4' type_id=10 bits_offset=192", + "[12] PTR '(anon)' type_id=8", + "[13] STRUCT 's3' size=8 vlen=1\n" + "\t'f1' type_id=12 bits_offset=0"); + + err = btf__dedup(btf2, NULL, NULL); + if (!ASSERT_OK(err, "btf_dedup")) + goto cleanup; + + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=5", + "[3] FWD 's2' fwd_kind=struct", + "[4] PTR '(anon)' type_id=3", + "[5] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=2 bits_offset=0\n" + "\t'f2' type_id=4 bits_offset=64", + "[6] PTR '(anon)' type_id=8", + "[7] PTR '(anon)' type_id=9", + "[8] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=6 bits_offset=0\n" + "\t'f2' type_id=7 bits_offset=64", + "[9] STRUCT 's2' size=40 vlen=4\n" + "\t'f1' type_id=6 bits_offset=0\n" + "\t'f2' type_id=7 bits_offset=64\n" + "\t'f3' type_id=1 bits_offset=128\n" + "\t'f4' type_id=8 bits_offset=192", + "[10] STRUCT 's3' size=8 vlen=1\n" + "\t'f1' type_id=7 bits_offset=0"); + +cleanup: + btf__free(btf2); + btf__free(btf1); +} + +void test_btf_dedup_split() +{ + if (test__start_subtest("split_simple")) + test_split_simple(); + if (test__start_subtest("split_struct_duped")) + test_split_struct_duped(); + if (test__start_subtest("split_fwd_resolve")) + test_split_fwd_resolve(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c index 86ccf37e26b3..762f6a9da8b5 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c @@ -17,7 +17,7 @@ #include "test_btf_skc_cls_ingress.skel.h" static struct test_btf_skc_cls_ingress *skel; -struct sockaddr_in6 srv_sa6; +static struct sockaddr_in6 srv_sa6; static __u32 duration; #define PROG_PIN_FILE "/sys/fs/bpf/btf_skc_cls_ingress" diff --git a/tools/testing/selftests/bpf/prog_tests/btf_split.c b/tools/testing/selftests/bpf/prog_tests/btf_split.c new file mode 100644 index 000000000000..ca7c2a91610a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_split.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <test_progs.h> +#include <bpf/btf.h> + +static char *dump_buf; +static size_t dump_buf_sz; +static FILE *dump_buf_file; + +static void btf_dump_printf(void *ctx, const char *fmt, va_list args) +{ + vfprintf(ctx, fmt, args); +} + +void test_btf_split() { + struct btf_dump_opts opts; + struct btf_dump *d = NULL; + const struct btf_type *t; + struct btf *btf1, *btf2; + int str_off, i, err; + + btf1 = btf__new_empty(); + if (!ASSERT_OK_PTR(btf1, "empty_main_btf")) + return; + + btf__set_pointer_size(btf1, 8); /* enforce 64-bit arch */ + + btf__add_int(btf1, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(btf1, 1); /* [2] ptr to int */ + + btf__add_struct(btf1, "s1", 4); /* [3] struct s1 { */ + btf__add_field(btf1, "f1", 1, 0, 0); /* int f1; */ + /* } */ + + btf2 = btf__new_empty_split(btf1); + if (!ASSERT_OK_PTR(btf2, "empty_split_btf")) + goto cleanup; + + /* pointer size should be "inherited" from main BTF */ + ASSERT_EQ(btf__pointer_size(btf2), 8, "inherit_ptr_sz"); + + str_off = btf__find_str(btf2, "int"); + ASSERT_NEQ(str_off, -ENOENT, "str_int_missing"); + + t = btf__type_by_id(btf2, 1); + if (!ASSERT_OK_PTR(t, "int_type")) + goto cleanup; + ASSERT_EQ(btf_is_int(t), true, "int_kind"); + ASSERT_STREQ(btf__str_by_offset(btf2, t->name_off), "int", "int_name"); + + btf__add_struct(btf2, "s2", 16); /* [4] struct s2 { */ + btf__add_field(btf2, "f1", 3, 0, 0); /* struct s1 f1; */ + btf__add_field(btf2, "f2", 1, 32, 0); /* int f2; */ + btf__add_field(btf2, "f3", 2, 64, 0); /* int *f3; */ + /* } */ + + t = btf__type_by_id(btf1, 4); + ASSERT_NULL(t, "split_type_in_main"); + + t = btf__type_by_id(btf2, 4); + if (!ASSERT_OK_PTR(t, "split_struct_type")) + goto cleanup; + ASSERT_EQ(btf_is_struct(t), true, "split_struct_kind"); + ASSERT_EQ(btf_vlen(t), 3, "split_struct_vlen"); + ASSERT_STREQ(btf__str_by_offset(btf2, t->name_off), "s2", "split_struct_name"); + + /* BTF-to-C dump of split BTF */ + dump_buf_file = open_memstream(&dump_buf, &dump_buf_sz); + if (!ASSERT_OK_PTR(dump_buf_file, "dump_memstream")) + return; + opts.ctx = dump_buf_file; + d = btf_dump__new(btf2, NULL, &opts, btf_dump_printf); + if (!ASSERT_OK_PTR(d, "btf_dump__new")) + goto cleanup; + for (i = 1; i <= btf__get_nr_types(btf2); i++) { + err = btf_dump__dump_type(d, i); + ASSERT_OK(err, "dump_type_ok"); + } + fflush(dump_buf_file); + dump_buf[dump_buf_sz] = 0; /* some libc implementations don't do this */ + ASSERT_STREQ(dump_buf, +"struct s1 {\n" +" int f1;\n" +"};\n" +"\n" +"struct s2 {\n" +" struct s1 f1;\n" +" int f2;\n" +" int *f3;\n" +"};\n\n", "c_dump"); + +cleanup: + if (dump_buf_file) + fclose(dump_buf_file); + free(dump_buf); + btf_dump__free(d); + btf__free(btf1); + btf__free(btf2); +} diff --git a/tools/testing/selftests/bpf/prog_tests/btf_write.c b/tools/testing/selftests/bpf/prog_tests/btf_write.c index 314e1e7c36df..f36da15b134f 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_write.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_write.c @@ -2,6 +2,7 @@ /* Copyright (c) 2020 Facebook */ #include <test_progs.h> #include <bpf/btf.h> +#include "btf_helpers.h" static int duration = 0; @@ -39,6 +40,8 @@ void test_btf_write() { ASSERT_EQ(t->size, 4, "int_sz"); ASSERT_EQ(btf_int_encoding(t), BTF_INT_SIGNED, "int_enc"); ASSERT_EQ(btf_int_bits(t), 32, "int_bits"); + ASSERT_STREQ(btf_type_raw_dump(btf, 1), + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", "raw_dump"); /* invalid int size */ id = btf__add_int(btf, "bad sz int", 7, 0); @@ -59,24 +62,32 @@ void test_btf_write() { t = btf__type_by_id(btf, 2); ASSERT_EQ(btf_kind(t), BTF_KIND_PTR, "ptr_kind"); ASSERT_EQ(t->type, 1, "ptr_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 2), + "[2] PTR '(anon)' type_id=1", "raw_dump"); id = btf__add_const(btf, 5); /* points forward to restrict */ ASSERT_EQ(id, 3, "const_id"); t = btf__type_by_id(btf, 3); ASSERT_EQ(btf_kind(t), BTF_KIND_CONST, "const_kind"); ASSERT_EQ(t->type, 5, "const_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 3), + "[3] CONST '(anon)' type_id=5", "raw_dump"); id = btf__add_volatile(btf, 3); ASSERT_EQ(id, 4, "volatile_id"); t = btf__type_by_id(btf, 4); ASSERT_EQ(btf_kind(t), BTF_KIND_VOLATILE, "volatile_kind"); ASSERT_EQ(t->type, 3, "volatile_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 4), + "[4] VOLATILE '(anon)' type_id=3", "raw_dump"); id = btf__add_restrict(btf, 4); ASSERT_EQ(id, 5, "restrict_id"); t = btf__type_by_id(btf, 5); ASSERT_EQ(btf_kind(t), BTF_KIND_RESTRICT, "restrict_kind"); ASSERT_EQ(t->type, 4, "restrict_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 5), + "[5] RESTRICT '(anon)' type_id=4", "raw_dump"); /* ARRAY */ id = btf__add_array(btf, 1, 2, 10); /* int *[10] */ @@ -86,6 +97,8 @@ void test_btf_write() { ASSERT_EQ(btf_array(t)->index_type, 1, "array_index_type"); ASSERT_EQ(btf_array(t)->type, 2, "array_elem_type"); ASSERT_EQ(btf_array(t)->nelems, 10, "array_nelems"); + ASSERT_STREQ(btf_type_raw_dump(btf, 6), + "[6] ARRAY '(anon)' type_id=2 index_type_id=1 nr_elems=10", "raw_dump"); /* STRUCT */ err = btf__add_field(btf, "field", 1, 0, 0); @@ -113,6 +126,10 @@ void test_btf_write() { ASSERT_EQ(m->type, 1, "f2_type"); ASSERT_EQ(btf_member_bit_offset(t, 1), 32, "f2_bit_off"); ASSERT_EQ(btf_member_bitfield_size(t, 1), 16, "f2_bit_sz"); + ASSERT_STREQ(btf_type_raw_dump(btf, 7), + "[7] STRUCT 's1' size=8 vlen=2\n" + "\t'f1' type_id=1 bits_offset=0\n" + "\t'f2' type_id=1 bits_offset=32 bitfield_size=16", "raw_dump"); /* UNION */ id = btf__add_union(btf, "u1", 8); @@ -136,6 +153,9 @@ void test_btf_write() { ASSERT_EQ(m->type, 1, "f1_type"); ASSERT_EQ(btf_member_bit_offset(t, 0), 0, "f1_bit_off"); ASSERT_EQ(btf_member_bitfield_size(t, 0), 16, "f1_bit_sz"); + ASSERT_STREQ(btf_type_raw_dump(btf, 8), + "[8] UNION 'u1' size=8 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0 bitfield_size=16", "raw_dump"); /* ENUM */ id = btf__add_enum(btf, "e1", 4); @@ -156,6 +176,10 @@ void test_btf_write() { v = btf_enum(t) + 1; ASSERT_STREQ(btf__str_by_offset(btf, v->name_off), "v2", "v2_name"); ASSERT_EQ(v->val, 2, "v2_val"); + ASSERT_STREQ(btf_type_raw_dump(btf, 9), + "[9] ENUM 'e1' size=4 vlen=2\n" + "\t'v1' val=1\n" + "\t'v2' val=2", "raw_dump"); /* FWDs */ id = btf__add_fwd(btf, "struct_fwd", BTF_FWD_STRUCT); @@ -164,6 +188,8 @@ void test_btf_write() { ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "struct_fwd", "fwd_name"); ASSERT_EQ(btf_kind(t), BTF_KIND_FWD, "fwd_kind"); ASSERT_EQ(btf_kflag(t), 0, "fwd_kflag"); + ASSERT_STREQ(btf_type_raw_dump(btf, 10), + "[10] FWD 'struct_fwd' fwd_kind=struct", "raw_dump"); id = btf__add_fwd(btf, "union_fwd", BTF_FWD_UNION); ASSERT_EQ(id, 11, "union_fwd_id"); @@ -171,6 +197,8 @@ void test_btf_write() { ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "union_fwd", "fwd_name"); ASSERT_EQ(btf_kind(t), BTF_KIND_FWD, "fwd_kind"); ASSERT_EQ(btf_kflag(t), 1, "fwd_kflag"); + ASSERT_STREQ(btf_type_raw_dump(btf, 11), + "[11] FWD 'union_fwd' fwd_kind=union", "raw_dump"); id = btf__add_fwd(btf, "enum_fwd", BTF_FWD_ENUM); ASSERT_EQ(id, 12, "enum_fwd_id"); @@ -179,6 +207,8 @@ void test_btf_write() { ASSERT_EQ(btf_kind(t), BTF_KIND_ENUM, "enum_fwd_kind"); ASSERT_EQ(btf_vlen(t), 0, "enum_fwd_kind"); ASSERT_EQ(t->size, 4, "enum_fwd_sz"); + ASSERT_STREQ(btf_type_raw_dump(btf, 12), + "[12] ENUM 'enum_fwd' size=4 vlen=0", "raw_dump"); /* TYPEDEF */ id = btf__add_typedef(btf, "typedef1", 1); @@ -187,6 +217,8 @@ void test_btf_write() { ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "typedef1", "typedef_name"); ASSERT_EQ(btf_kind(t), BTF_KIND_TYPEDEF, "typedef_kind"); ASSERT_EQ(t->type, 1, "typedef_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 13), + "[13] TYPEDEF 'typedef1' type_id=1", "raw_dump"); /* FUNC & FUNC_PROTO */ id = btf__add_func(btf, "func1", BTF_FUNC_GLOBAL, 15); @@ -196,6 +228,8 @@ void test_btf_write() { ASSERT_EQ(t->type, 15, "func_type"); ASSERT_EQ(btf_kind(t), BTF_KIND_FUNC, "func_kind"); ASSERT_EQ(btf_vlen(t), BTF_FUNC_GLOBAL, "func_vlen"); + ASSERT_STREQ(btf_type_raw_dump(btf, 14), + "[14] FUNC 'func1' type_id=15 linkage=global", "raw_dump"); id = btf__add_func_proto(btf, 1); ASSERT_EQ(id, 15, "func_proto_id"); @@ -214,6 +248,10 @@ void test_btf_write() { p = btf_params(t) + 1; ASSERT_STREQ(btf__str_by_offset(btf, p->name_off), "p2", "p2_name"); ASSERT_EQ(p->type, 2, "p2_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 15), + "[15] FUNC_PROTO '(anon)' ret_type_id=1 vlen=2\n" + "\t'p1' type_id=1\n" + "\t'p2' type_id=2", "raw_dump"); /* VAR */ id = btf__add_var(btf, "var1", BTF_VAR_GLOBAL_ALLOCATED, 1); @@ -223,6 +261,8 @@ void test_btf_write() { ASSERT_EQ(btf_kind(t), BTF_KIND_VAR, "var_kind"); ASSERT_EQ(t->type, 1, "var_type"); ASSERT_EQ(btf_var(t)->linkage, BTF_VAR_GLOBAL_ALLOCATED, "var_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 16), + "[16] VAR 'var1' type_id=1, linkage=global-alloc", "raw_dump"); /* DATASECT */ id = btf__add_datasec(btf, "datasec1", 12); @@ -239,6 +279,9 @@ void test_btf_write() { ASSERT_EQ(vi->type, 1, "v1_type"); ASSERT_EQ(vi->offset, 4, "v1_off"); ASSERT_EQ(vi->size, 8, "v1_sz"); + ASSERT_STREQ(btf_type_raw_dump(btf, 17), + "[17] DATASEC 'datasec1' size=12 vlen=1\n" + "\ttype_id=1 offset=4 size=8", "raw_dump"); btf__free(btf); } diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 30e40ff4b0d8..06eb956ff7bb 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> #include "progs/core_reloc_types.h" +#include "bpf_testmod/bpf_testmod.h" #include <sys/mman.h> #include <sys/syscall.h> #include <bpf/btf.h> @@ -9,6 +10,30 @@ static int duration = 0; #define STRUCT_TO_CHAR_PTR(struct_name) (const char *)&(struct struct_name) +#define MODULES_CASE(name, sec_name, tp_name) { \ + .case_name = name, \ + .bpf_obj_file = "test_core_reloc_module.o", \ + .btf_src_file = NULL, /* find in kernel module BTFs */ \ + .input = "", \ + .input_len = 0, \ + .output = STRUCT_TO_CHAR_PTR(core_reloc_module_output) { \ + .read_ctx_sz = sizeof(struct bpf_testmod_test_read_ctx),\ + .read_ctx_exists = true, \ + .buf_exists = true, \ + .len_exists = true, \ + .off_exists = true, \ + .len = 123, \ + .off = 0, \ + .comm = "test_progs", \ + .comm_len = sizeof("test_progs"), \ + }, \ + .output_len = sizeof(struct core_reloc_module_output), \ + .prog_sec_name = sec_name, \ + .raw_tp_name = tp_name, \ + .trigger = trigger_module_test_read, \ + .needs_testmod = true, \ +} + #define FLAVORS_DATA(struct_name) STRUCT_TO_CHAR_PTR(struct_name) { \ .a = 42, \ .b = 0xc001, \ @@ -211,7 +236,7 @@ static int duration = 0; .output = STRUCT_TO_CHAR_PTR(core_reloc_bitfields_output) \ __VA_ARGS__, \ .output_len = sizeof(struct core_reloc_bitfields_output), \ - .direct_raw_tp = true, \ + .prog_sec_name = "tp_btf/sys_enter", \ } @@ -222,7 +247,7 @@ static int duration = 0; }, { \ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o", \ "direct:", name), \ - .direct_raw_tp = true, \ + .prog_sec_name = "tp_btf/sys_enter", \ .fails = true, \ } @@ -309,6 +334,7 @@ static int duration = 0; struct core_reloc_test_case; typedef int (*setup_test_fn)(struct core_reloc_test_case *test); +typedef int (*trigger_test_fn)(const struct core_reloc_test_case *test); struct core_reloc_test_case { const char *case_name; @@ -319,9 +345,12 @@ struct core_reloc_test_case { const char *output; int output_len; bool fails; + bool needs_testmod; bool relaxed_core_relocs; - bool direct_raw_tp; + const char *prog_sec_name; + const char *raw_tp_name; setup_test_fn setup; + trigger_test_fn trigger; }; static int find_btf_type(const struct btf *btf, const char *name, __u32 kind) @@ -451,6 +480,23 @@ static int setup_type_id_case_failure(struct core_reloc_test_case *test) return 0; } +static int trigger_module_test_read(const struct core_reloc_test_case *test) +{ + struct core_reloc_module_output *exp = (void *)test->output; + int fd, err; + + fd = open("/sys/kernel/bpf_testmod", O_RDONLY); + err = -errno; + if (CHECK(fd < 0, "testmod_file_open", "failed: %d\n", err)) + return err; + + read(fd, NULL, exp->len); /* request expected number of bytes */ + close(fd); + + return 0; +} + + static struct core_reloc_test_case test_cases[] = { /* validate we can find kernel image and use its BTF for relocs */ { @@ -467,6 +513,10 @@ static struct core_reloc_test_case test_cases[] = { .output_len = sizeof(struct core_reloc_kernel_output), }, + /* validate we can find kernel module BTF types for relocs/attach */ + MODULES_CASE("module_probed", "raw_tp/bpf_testmod_test_read", "bpf_testmod_test_read"), + MODULES_CASE("module_direct", "tp_btf/bpf_testmod_test_read", NULL), + /* validate BPF program can use multiple flavors to match against * single target BTF type */ @@ -779,6 +829,11 @@ void test_core_reloc(void) if (!test__start_subtest(test_case->case_name)) continue; + if (test_case->needs_testmod && !env.has_testmod) { + test__skip(); + continue; + } + if (test_case->setup) { err = test_case->setup(test_case); if (CHECK(err, "test_setup", "test #%d setup failed: %d\n", i, err)) @@ -790,13 +845,11 @@ void test_core_reloc(void) test_case->bpf_obj_file, PTR_ERR(obj))) continue; - /* for typed raw tracepoints, NULL should be specified */ - if (test_case->direct_raw_tp) { - probe_name = "tp_btf/sys_enter"; - tp_name = NULL; - } else { - probe_name = "raw_tracepoint/sys_enter"; - tp_name = "sys_enter"; + probe_name = "raw_tracepoint/sys_enter"; + tp_name = "sys_enter"; + if (test_case->prog_sec_name) { + probe_name = test_case->prog_sec_name; + tp_name = test_case->raw_tp_name; /* NULL for tp_btf */ } prog = bpf_object__find_program_by_title(obj, probe_name); @@ -837,7 +890,12 @@ void test_core_reloc(void) goto cleanup; /* trigger test run */ - usleep(1); + if (test_case->trigger) { + if (!ASSERT_OK(test_case->trigger(test_case), "test_trigger")) + goto cleanup; + } else { + usleep(1); + } if (data->skip) { test__skip(); diff --git a/tools/testing/selftests/bpf/prog_tests/hash_large_key.c b/tools/testing/selftests/bpf/prog_tests/hash_large_key.c new file mode 100644 index 000000000000..34684c0fc76d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/hash_large_key.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include "test_hash_large_key.skel.h" + +void test_hash_large_key(void) +{ + int err, value = 21, duration = 0, hash_map_fd; + struct test_hash_large_key *skel; + + struct bigelement { + int a; + char b[4096]; + long long c; + } key; + bzero(&key, sizeof(key)); + + skel = test_hash_large_key__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) + return; + + hash_map_fd = bpf_map__fd(skel->maps.hash_map); + if (CHECK(hash_map_fd < 0, "bpf_map__fd", "failed\n")) + goto cleanup; + + err = test_hash_large_key__attach(skel); + if (CHECK(err, "attach_raw_tp", "err %d\n", err)) + goto cleanup; + + err = bpf_map_update_elem(hash_map_fd, &key, &value, BPF_ANY); + if (CHECK(err, "bpf_map_update_elem", "errno=%d\n", errno)) + goto cleanup; + + key.c = 1; + err = bpf_map_lookup_elem(hash_map_fd, &key, &value); + if (CHECK(err, "bpf_map_lookup_elem", "errno=%d\n", errno)) + goto cleanup; + + CHECK_FAIL(value != 42); + +cleanup: + test_hash_large_key__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c new file mode 100644 index 000000000000..50796b651f72 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <test_progs.h> +#include "test_module_attach.skel.h" + +static int duration; + +static int trigger_module_test_read(int read_sz) +{ + int fd, err; + + fd = open("/sys/kernel/bpf_testmod", O_RDONLY); + err = -errno; + if (CHECK(fd < 0, "testmod_file_open", "failed: %d\n", err)) + return err; + + read(fd, NULL, read_sz); + close(fd); + + return 0; +} + +void test_module_attach(void) +{ + const int READ_SZ = 456; + struct test_module_attach* skel; + struct test_module_attach__bss *bss; + int err; + + skel = test_module_attach__open(); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + return; + + err = bpf_program__set_attach_target(skel->progs.handle_fentry_manual, + 0, "bpf_testmod_test_read"); + ASSERT_OK(err, "set_attach_target"); + + err = test_module_attach__load(skel); + if (CHECK(err, "skel_load", "failed to load skeleton\n")) + return; + + bss = skel->bss; + + err = test_module_attach__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* trigger tracepoint */ + ASSERT_OK(trigger_module_test_read(READ_SZ), "trigger_read"); + + ASSERT_EQ(bss->raw_tp_read_sz, READ_SZ, "raw_tp"); + ASSERT_EQ(bss->tp_btf_read_sz, READ_SZ, "tp_btf"); + ASSERT_EQ(bss->fentry_read_sz, READ_SZ, "fentry"); + ASSERT_EQ(bss->fentry_manual_read_sz, READ_SZ, "fentry_manual"); + ASSERT_EQ(bss->fexit_read_sz, READ_SZ, "fexit"); + ASSERT_EQ(bss->fexit_ret, -EIO, "fexit_tet"); + ASSERT_EQ(bss->fmod_ret_read_sz, READ_SZ, "fmod_ret"); + +cleanup: + test_module_attach__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index c1650548433c..fddbc5db5d6a 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -217,9 +217,15 @@ void test_ringbuf(void) if (CHECK(err, "join_bg", "err %d\n", err)) goto cleanup; - if (CHECK(bg_ret != 1, "bg_ret", "epoll_wait result: %ld", bg_ret)) + if (CHECK(bg_ret <= 0, "bg_ret", "epoll_wait result: %ld", bg_ret)) goto cleanup; + /* due to timing variations, there could still be non-notified + * samples, so consume them here to collect all the samples + */ + err = ring_buffer__consume(ringbuf); + CHECK(err < 0, "rb_consume", "failed: %d\b", err); + /* 3 rounds, 2 samples each */ cnt = atomic_xchg(&sample_cnt, 0); CHECK(cnt != 6, "cnt", "exp %d samples, got %d\n", 6, cnt); diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c index 78e450609803..d37161e59bb2 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -81,7 +81,7 @@ void test_ringbuf_multi(void) /* poll for samples, should get 2 ringbufs back */ err = ring_buffer__poll(ringbuf, -1); - if (CHECK(err != 4, "poll_res", "expected 4 records, got %d\n", err)) + if (CHECK(err != 2, "poll_res", "expected 2 records, got %d\n", err)) goto cleanup; /* expect extra polling to return nothing */ diff --git a/tools/testing/selftests/bpf/prog_tests/sk_storage_tracing.c b/tools/testing/selftests/bpf/prog_tests/sk_storage_tracing.c new file mode 100644 index 000000000000..2b392590e8ca --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sk_storage_tracing.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <sys/types.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "test_progs.h" +#include "network_helpers.h" +#include "test_sk_storage_trace_itself.skel.h" +#include "test_sk_storage_tracing.skel.h" + +#define LO_ADDR6 "::1" +#define TEST_COMM "test_progs" + +struct sk_stg { + __u32 pid; + __u32 last_notclose_state; + char comm[16]; +}; + +static struct test_sk_storage_tracing *skel; +static __u32 duration; +static pid_t my_pid; + +static int check_sk_stg(int sk_fd, __u32 expected_state) +{ + struct sk_stg sk_stg; + int err; + + err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sk_stg_map), &sk_fd, + &sk_stg); + if (!ASSERT_OK(err, "map_lookup(sk_stg_map)")) + return -1; + + if (!ASSERT_EQ(sk_stg.last_notclose_state, expected_state, + "last_notclose_state")) + return -1; + + if (!ASSERT_EQ(sk_stg.pid, my_pid, "pid")) + return -1; + + if (!ASSERT_STREQ(sk_stg.comm, skel->bss->task_comm, "task_comm")) + return -1; + + return 0; +} + +static void do_test(void) +{ + int listen_fd = -1, passive_fd = -1, active_fd = -1, value = 1, err; + char abyte; + + listen_fd = start_server(AF_INET6, SOCK_STREAM, LO_ADDR6, 0, 0); + if (CHECK(listen_fd == -1, "start_server", + "listen_fd:%d errno:%d\n", listen_fd, errno)) + return; + + active_fd = connect_to_fd(listen_fd, 0); + if (CHECK(active_fd == -1, "connect_to_fd", "active_fd:%d errno:%d\n", + active_fd, errno)) + goto out; + + err = bpf_map_update_elem(bpf_map__fd(skel->maps.del_sk_stg_map), + &active_fd, &value, 0); + if (!ASSERT_OK(err, "map_update(del_sk_stg_map)")) + goto out; + + passive_fd = accept(listen_fd, NULL, 0); + if (CHECK(passive_fd == -1, "accept", "passive_fd:%d errno:%d\n", + passive_fd, errno)) + goto out; + + shutdown(active_fd, SHUT_WR); + err = read(passive_fd, &abyte, 1); + if (!ASSERT_OK(err, "read(passive_fd)")) + goto out; + + shutdown(passive_fd, SHUT_WR); + err = read(active_fd, &abyte, 1); + if (!ASSERT_OK(err, "read(active_fd)")) + goto out; + + err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.del_sk_stg_map), + &active_fd, &value); + if (!ASSERT_ERR(err, "map_lookup(del_sk_stg_map)")) + goto out; + + err = check_sk_stg(listen_fd, BPF_TCP_LISTEN); + if (!ASSERT_OK(err, "listen_fd sk_stg")) + goto out; + + err = check_sk_stg(active_fd, BPF_TCP_FIN_WAIT2); + if (!ASSERT_OK(err, "active_fd sk_stg")) + goto out; + + err = check_sk_stg(passive_fd, BPF_TCP_LAST_ACK); + ASSERT_OK(err, "passive_fd sk_stg"); + +out: + if (active_fd != -1) + close(active_fd); + if (passive_fd != -1) + close(passive_fd); + if (listen_fd != -1) + close(listen_fd); +} + +void test_sk_storage_tracing(void) +{ + struct test_sk_storage_trace_itself *skel_itself; + int err; + + my_pid = getpid(); + + skel_itself = test_sk_storage_trace_itself__open_and_load(); + + if (!ASSERT_NULL(skel_itself, "test_sk_storage_trace_itself")) { + test_sk_storage_trace_itself__destroy(skel_itself); + return; + } + + skel = test_sk_storage_tracing__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_sk_storage_tracing")) + return; + + err = test_sk_storage_tracing__attach(skel); + if (!ASSERT_OK(err, "test_sk_storage_tracing__attach")) { + test_sk_storage_tracing__destroy(skel); + return; + } + + do_test(); + + test_sk_storage_tracing__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c index c85174cdcb77..08d19cafd5e8 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c @@ -18,12 +18,12 @@ #define LO_ADDR6 "::1" #define CG_NAME "/tcpbpf-hdr-opt-test" -struct bpf_test_option exp_passive_estab_in; -struct bpf_test_option exp_active_estab_in; -struct bpf_test_option exp_passive_fin_in; -struct bpf_test_option exp_active_fin_in; -struct hdr_stg exp_passive_hdr_stg; -struct hdr_stg exp_active_hdr_stg = { .active = true, }; +static struct bpf_test_option exp_passive_estab_in; +static struct bpf_test_option exp_active_estab_in; +static struct bpf_test_option exp_passive_fin_in; +static struct bpf_test_option exp_active_fin_in; +static struct hdr_stg exp_passive_hdr_stg; +static struct hdr_stg exp_active_hdr_stg = { .active = true, }; static struct test_misc_tcp_hdr_options *misc_skel; static struct test_tcp_hdr_options *skel; diff --git a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c new file mode 100644 index 000000000000..87923d2865b7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <network_helpers.h> + +#include "test_tcpbpf.h" +#include "test_tcpbpf_kern.skel.h" + +#define LO_ADDR6 "::1" +#define CG_NAME "/tcpbpf-user-test" + +static __u32 duration; + +static void verify_result(struct tcpbpf_globals *result) +{ + __u32 expected_events = ((1 << BPF_SOCK_OPS_TIMEOUT_INIT) | + (1 << BPF_SOCK_OPS_RWND_INIT) | + (1 << BPF_SOCK_OPS_TCP_CONNECT_CB) | + (1 << BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB) | + (1 << BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) | + (1 << BPF_SOCK_OPS_NEEDS_ECN) | + (1 << BPF_SOCK_OPS_STATE_CB) | + (1 << BPF_SOCK_OPS_TCP_LISTEN_CB)); + + /* check global map */ + CHECK(expected_events != result->event_map, "event_map", + "unexpected event_map: actual 0x%08x != expected 0x%08x\n", + result->event_map, expected_events); + + ASSERT_EQ(result->bytes_received, 501, "bytes_received"); + ASSERT_EQ(result->bytes_acked, 1002, "bytes_acked"); + ASSERT_EQ(result->data_segs_in, 1, "data_segs_in"); + ASSERT_EQ(result->data_segs_out, 1, "data_segs_out"); + ASSERT_EQ(result->bad_cb_test_rv, 0x80, "bad_cb_test_rv"); + ASSERT_EQ(result->good_cb_test_rv, 0, "good_cb_test_rv"); + ASSERT_EQ(result->num_listen, 1, "num_listen"); + + /* 3 comes from one listening socket + both ends of the connection */ + ASSERT_EQ(result->num_close_events, 3, "num_close_events"); + + /* check setsockopt for SAVE_SYN */ + ASSERT_EQ(result->tcp_save_syn, 0, "tcp_save_syn"); + + /* check getsockopt for SAVED_SYN */ + ASSERT_EQ(result->tcp_saved_syn, 1, "tcp_saved_syn"); + + /* check getsockopt for window_clamp */ + ASSERT_EQ(result->window_clamp_client, 9216, "window_clamp_client"); + ASSERT_EQ(result->window_clamp_server, 9216, "window_clamp_server"); +} + +static void run_test(struct tcpbpf_globals *result) +{ + int listen_fd = -1, cli_fd = -1, accept_fd = -1; + char buf[1000]; + int err = -1; + int i, rv; + + listen_fd = start_server(AF_INET6, SOCK_STREAM, LO_ADDR6, 0, 0); + if (CHECK(listen_fd == -1, "start_server", "listen_fd:%d errno:%d\n", + listen_fd, errno)) + goto done; + + cli_fd = connect_to_fd(listen_fd, 0); + if (CHECK(cli_fd == -1, "connect_to_fd(listen_fd)", + "cli_fd:%d errno:%d\n", cli_fd, errno)) + goto done; + + accept_fd = accept(listen_fd, NULL, NULL); + if (CHECK(accept_fd == -1, "accept(listen_fd)", + "accept_fd:%d errno:%d\n", accept_fd, errno)) + goto done; + + /* Send 1000B of '+'s from cli_fd -> accept_fd */ + for (i = 0; i < 1000; i++) + buf[i] = '+'; + + rv = send(cli_fd, buf, 1000, 0); + if (CHECK(rv != 1000, "send(cli_fd)", "rv:%d errno:%d\n", rv, errno)) + goto done; + + rv = recv(accept_fd, buf, 1000, 0); + if (CHECK(rv != 1000, "recv(accept_fd)", "rv:%d errno:%d\n", rv, errno)) + goto done; + + /* Send 500B of '.'s from accept_fd ->cli_fd */ + for (i = 0; i < 500; i++) + buf[i] = '.'; + + rv = send(accept_fd, buf, 500, 0); + if (CHECK(rv != 500, "send(accept_fd)", "rv:%d errno:%d\n", rv, errno)) + goto done; + + rv = recv(cli_fd, buf, 500, 0); + if (CHECK(rv != 500, "recv(cli_fd)", "rv:%d errno:%d\n", rv, errno)) + goto done; + + /* + * shutdown accept first to guarantee correct ordering for + * bytes_received and bytes_acked when we go to verify the results. + */ + shutdown(accept_fd, SHUT_WR); + err = recv(cli_fd, buf, 1, 0); + if (CHECK(err, "recv(cli_fd) for fin", "err:%d errno:%d\n", err, errno)) + goto done; + + shutdown(cli_fd, SHUT_WR); + err = recv(accept_fd, buf, 1, 0); + CHECK(err, "recv(accept_fd) for fin", "err:%d errno:%d\n", err, errno); +done: + if (accept_fd != -1) + close(accept_fd); + if (cli_fd != -1) + close(cli_fd); + if (listen_fd != -1) + close(listen_fd); + + if (!err) + verify_result(result); +} + +void test_tcpbpf_user(void) +{ + struct test_tcpbpf_kern *skel; + int cg_fd = -1; + + skel = test_tcpbpf_kern__open_and_load(); + if (CHECK(!skel, "open and load skel", "failed")) + return; + + cg_fd = test__join_cgroup(CG_NAME); + if (CHECK(cg_fd < 0, "test__join_cgroup(" CG_NAME ")", + "cg_fd:%d errno:%d", cg_fd, errno)) + goto err; + + skel->links.bpf_testcb = bpf_program__attach_cgroup(skel->progs.bpf_testcb, cg_fd); + if (!ASSERT_OK_PTR(skel->links.bpf_testcb, "attach_cgroup(bpf_testcb)")) + goto err; + + run_test(&skel->bss->global); + +err: + if (cg_fd != -1) + close(cg_fd); + test_tcpbpf_kern__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c new file mode 100644 index 000000000000..2559bb775762 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2020 Google LLC. + */ + +#include <test_progs.h> +#include <linux/limits.h> + +#include "bprm_opts.skel.h" +#include "network_helpers.h" + +#ifndef __NR_pidfd_open +#define __NR_pidfd_open 434 +#endif + +static const char * const bash_envp[] = { "TMPDIR=shouldnotbeset", NULL }; + +static inline int sys_pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int update_storage(int map_fd, int secureexec) +{ + int task_fd, ret = 0; + + task_fd = sys_pidfd_open(getpid(), 0); + if (task_fd < 0) + return errno; + + ret = bpf_map_update_elem(map_fd, &task_fd, &secureexec, BPF_NOEXIST); + if (ret) + ret = errno; + + close(task_fd); + return ret; +} + +static int run_set_secureexec(int map_fd, int secureexec) +{ + int child_pid, child_status, ret, null_fd; + + child_pid = fork(); + if (child_pid == 0) { + null_fd = open("/dev/null", O_WRONLY); + if (null_fd == -1) + exit(errno); + dup2(null_fd, STDOUT_FILENO); + dup2(null_fd, STDERR_FILENO); + close(null_fd); + + /* Ensure that all executions from hereon are + * secure by setting a local storage which is read by + * the bprm_creds_for_exec hook and sets bprm->secureexec. + */ + ret = update_storage(map_fd, secureexec); + if (ret) + exit(ret); + + /* If the binary is executed with securexec=1, the dynamic + * loader ingores and unsets certain variables like LD_PRELOAD, + * TMPDIR etc. TMPDIR is used here to simplify the example, as + * LD_PRELOAD requires a real .so file. + * + * If the value of TMPDIR is set, the bash command returns 10 + * and if the value is unset, it returns 20. + */ + execle("/bin/bash", "bash", "-c", + "[[ -z \"${TMPDIR}\" ]] || exit 10 && exit 20", NULL, + bash_envp); + exit(errno); + } else if (child_pid > 0) { + waitpid(child_pid, &child_status, 0); + ret = WEXITSTATUS(child_status); + + /* If a secureexec occurred, the exit status should be 20 */ + if (secureexec && ret == 20) + return 0; + + /* If normal execution happened, the exit code should be 10 */ + if (!secureexec && ret == 10) + return 0; + } + + return -EINVAL; +} + +void test_test_bprm_opts(void) +{ + int err, duration = 0; + struct bprm_opts *skel = NULL; + + skel = bprm_opts__open_and_load(); + if (CHECK(!skel, "skel_load", "skeleton failed\n")) + goto close_prog; + + err = bprm_opts__attach(skel); + if (CHECK(err, "attach", "attach failed: %d\n", err)) + goto close_prog; + + /* Run the test with the secureexec bit unset */ + err = run_set_secureexec(bpf_map__fd(skel->maps.secure_exec_task_map), + 0 /* secureexec */); + if (CHECK(err, "run_set_secureexec:0", "err = %d\n", err)) + goto close_prog; + + /* Run the test with the secureexec bit set */ + err = run_set_secureexec(bpf_map__fd(skel->maps.secure_exec_task_map), + 1 /* secureexec */); + if (CHECK(err, "run_set_secureexec:1", "err = %d\n", err)) + goto close_prog; + +close_prog: + bprm_opts__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_ima.c b/tools/testing/selftests/bpf/prog_tests/test_ima.c new file mode 100644 index 000000000000..61fca681d524 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_ima.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2020 Google LLC. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/wait.h> +#include <test_progs.h> + +#include "ima.skel.h" + +static int run_measured_process(const char *measured_dir, u32 *monitored_pid) +{ + int child_pid, child_status; + + child_pid = fork(); + if (child_pid == 0) { + *monitored_pid = getpid(); + execlp("./ima_setup.sh", "./ima_setup.sh", "run", measured_dir, + NULL); + exit(errno); + + } else if (child_pid > 0) { + waitpid(child_pid, &child_status, 0); + return WEXITSTATUS(child_status); + } + + return -EINVAL; +} + +void test_test_ima(void) +{ + char measured_dir_template[] = "/tmp/ima_measuredXXXXXX"; + const char *measured_dir; + char cmd[256]; + + int err, duration = 0; + struct ima *skel = NULL; + + skel = ima__open_and_load(); + if (CHECK(!skel, "skel_load", "skeleton failed\n")) + goto close_prog; + + err = ima__attach(skel); + if (CHECK(err, "attach", "attach failed: %d\n", err)) + goto close_prog; + + measured_dir = mkdtemp(measured_dir_template); + if (CHECK(measured_dir == NULL, "mkdtemp", "err %d\n", errno)) + goto close_prog; + + snprintf(cmd, sizeof(cmd), "./ima_setup.sh setup %s", measured_dir); + if (CHECK_FAIL(system(cmd))) + goto close_clean; + + err = run_measured_process(measured_dir, &skel->bss->monitored_pid); + if (CHECK(err, "run_measured_process", "err = %d\n", err)) + goto close_clean; + + CHECK(skel->data->ima_hash_ret < 0, "ima_hash_ret", + "ima_hash_ret = %ld\n", skel->data->ima_hash_ret); + + CHECK(skel->bss->ima_hash == 0, "ima_hash", + "ima_hash = %lu\n", skel->bss->ima_hash); + +close_clean: + snprintf(cmd, sizeof(cmd), "./ima_setup.sh cleanup %s", measured_dir); + CHECK_FAIL(system(cmd)); +close_prog: + ima__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c index 91cd6f357246..c0fe73a17ed1 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c @@ -4,30 +4,173 @@ * Copyright (C) 2020 Google LLC. */ +#include <asm-generic/errno-base.h> +#include <sys/stat.h> #include <test_progs.h> #include <linux/limits.h> #include "local_storage.skel.h" #include "network_helpers.h" -int create_and_unlink_file(void) +#ifndef __NR_pidfd_open +#define __NR_pidfd_open 434 +#endif + +static inline int sys_pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static unsigned int duration; + +#define TEST_STORAGE_VALUE 0xbeefdead + +struct storage { + void *inode; + unsigned int value; + /* Lock ensures that spin locked versions of local stoage operations + * also work, most operations in this tests are still single threaded + */ + struct bpf_spin_lock lock; +}; + +/* Copies an rm binary to a temp file. dest is a mkstemp template */ +static int copy_rm(char *dest) { - char fname[PATH_MAX] = "/tmp/fileXXXXXX"; - int fd; + int fd_in, fd_out = -1, ret = 0; + struct stat stat; + char *buf = NULL; + + fd_in = open("/bin/rm", O_RDONLY); + if (fd_in < 0) + return -errno; + + fd_out = mkstemp(dest); + if (fd_out < 0) { + ret = -errno; + goto out; + } + + ret = fstat(fd_in, &stat); + if (ret == -1) { + ret = -errno; + goto out; + } + + buf = malloc(stat.st_blksize); + if (!buf) { + ret = -errno; + goto out; + } + + while (ret = read(fd_in, buf, stat.st_blksize), ret > 0) { + ret = write(fd_out, buf, ret); + if (ret < 0) { + ret = -errno; + goto out; + + } + } + if (ret < 0) { + ret = -errno; + goto out; + + } + + /* Set executable permission on the copied file */ + ret = chmod(dest, 0100); + if (ret == -1) + ret = -errno; + +out: + free(buf); + close(fd_in); + close(fd_out); + return ret; +} - fd = mkstemp(fname); - if (fd < 0) - return fd; +/* Fork and exec the provided rm binary and return the exit code of the + * forked process and its pid. + */ +static int run_self_unlink(int *monitored_pid, const char *rm_path) +{ + int child_pid, child_status, ret; + int null_fd; + + child_pid = fork(); + if (child_pid == 0) { + null_fd = open("/dev/null", O_WRONLY); + dup2(null_fd, STDOUT_FILENO); + dup2(null_fd, STDERR_FILENO); + close(null_fd); + + *monitored_pid = getpid(); + /* Use the copied /usr/bin/rm to delete itself + * /tmp/copy_of_rm /tmp/copy_of_rm. + */ + ret = execlp(rm_path, rm_path, rm_path, NULL); + if (ret) + exit(errno); + } else if (child_pid > 0) { + waitpid(child_pid, &child_status, 0); + return WEXITSTATUS(child_status); + } + + return -EINVAL; +} - close(fd); - unlink(fname); - return 0; +static bool check_syscall_operations(int map_fd, int obj_fd) +{ + struct storage val = { .value = TEST_STORAGE_VALUE, .lock = { 0 } }, + lookup_val = { .value = 0, .lock = { 0 } }; + int err; + + /* Looking up an existing element should fail initially */ + err = bpf_map_lookup_elem_flags(map_fd, &obj_fd, &lookup_val, + BPF_F_LOCK); + if (CHECK(!err || errno != ENOENT, "bpf_map_lookup_elem", + "err:%d errno:%d\n", err, errno)) + return false; + + /* Create a new element */ + err = bpf_map_update_elem(map_fd, &obj_fd, &val, + BPF_NOEXIST | BPF_F_LOCK); + if (CHECK(err < 0, "bpf_map_update_elem", "err:%d errno:%d\n", err, + errno)) + return false; + + /* Lookup the newly created element */ + err = bpf_map_lookup_elem_flags(map_fd, &obj_fd, &lookup_val, + BPF_F_LOCK); + if (CHECK(err < 0, "bpf_map_lookup_elem", "err:%d errno:%d", err, + errno)) + return false; + + /* Check the value of the newly created element */ + if (CHECK(lookup_val.value != val.value, "bpf_map_lookup_elem", + "value got = %x errno:%d", lookup_val.value, val.value)) + return false; + + err = bpf_map_delete_elem(map_fd, &obj_fd); + if (CHECK(err, "bpf_map_delete_elem()", "err:%d errno:%d\n", err, + errno)) + return false; + + /* The lookup should fail, now that the element has been deleted */ + err = bpf_map_lookup_elem_flags(map_fd, &obj_fd, &lookup_val, + BPF_F_LOCK); + if (CHECK(!err || errno != ENOENT, "bpf_map_lookup_elem", + "err:%d errno:%d\n", err, errno)) + return false; + + return true; } void test_test_local_storage(void) { + char tmp_exec_path[PATH_MAX] = "/tmp/copy_of_rmXXXXXX"; + int err, serv_sk = -1, task_fd = -1, rm_fd = -1; struct local_storage *skel = NULL; - int err, duration = 0, serv_sk = -1; skel = local_storage__open_and_load(); if (CHECK(!skel, "skel_load", "lsm skeleton failed\n")) @@ -37,12 +180,46 @@ void test_test_local_storage(void) if (CHECK(err, "attach", "lsm attach failed: %d\n", err)) goto close_prog; - skel->bss->monitored_pid = getpid(); + task_fd = sys_pidfd_open(getpid(), 0); + if (CHECK(task_fd < 0, "pidfd_open", + "failed to get pidfd err:%d, errno:%d", task_fd, errno)) + goto close_prog; - err = create_and_unlink_file(); - if (CHECK(err < 0, "exec_cmd", "err %d errno %d\n", err, errno)) + if (!check_syscall_operations(bpf_map__fd(skel->maps.task_storage_map), + task_fd)) goto close_prog; + err = copy_rm(tmp_exec_path); + if (CHECK(err < 0, "copy_rm", "err %d errno %d\n", err, errno)) + goto close_prog; + + rm_fd = open(tmp_exec_path, O_RDONLY); + if (CHECK(rm_fd < 0, "open", "failed to open %s err:%d, errno:%d", + tmp_exec_path, rm_fd, errno)) + goto close_prog; + + if (!check_syscall_operations(bpf_map__fd(skel->maps.inode_storage_map), + rm_fd)) + goto close_prog; + + /* Sets skel->bss->monitored_pid to the pid of the forked child + * forks a child process that executes tmp_exec_path and tries to + * unlink its executable. This operation should be denied by the loaded + * LSM program. + */ + err = run_self_unlink(&skel->bss->monitored_pid, tmp_exec_path); + if (CHECK(err != EPERM, "run_self_unlink", "err %d want EPERM\n", err)) + goto close_prog_unlink; + + /* Set the process being monitored to be the current process */ + skel->bss->monitored_pid = getpid(); + + /* Remove the temporary created executable */ + err = unlink(tmp_exec_path); + if (CHECK(err != 0, "unlink", "unable to unlink %s: %d", tmp_exec_path, + errno)) + goto close_prog_unlink; + CHECK(skel->data->inode_storage_result != 0, "inode_storage_result", "inode_local_storage not set\n"); @@ -53,8 +230,15 @@ void test_test_local_storage(void) CHECK(skel->data->sk_storage_result != 0, "sk_storage_result", "sk_local_storage not set\n"); - close(serv_sk); + if (!check_syscall_operations(bpf_map__fd(skel->maps.sk_storage_map), + serv_sk)) + goto close_prog; +close_prog_unlink: + unlink(tmp_exec_path); close_prog: + close(serv_sk); + close(rm_fd); + close(task_fd); local_storage__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_skb_pkt_end.c b/tools/testing/selftests/bpf/prog_tests/test_skb_pkt_end.c new file mode 100644 index 000000000000..cf1215531920 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_skb_pkt_end.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <test_progs.h> +#include <network_helpers.h> +#include "skb_pkt_end.skel.h" + +static int sanity_run(struct bpf_program *prog) +{ + __u32 duration, retval; + int err, prog_fd; + + prog_fd = bpf_program__fd(prog); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, &retval, &duration); + if (CHECK(err || retval != 123, "test_run", + "err %d errno %d retval %d duration %d\n", + err, errno, retval, duration)) + return -1; + return 0; +} + +void test_test_skb_pkt_end(void) +{ + struct skb_pkt_end *skb_pkt_end_skel = NULL; + __u32 duration = 0; + int err; + + skb_pkt_end_skel = skb_pkt_end__open_and_load(); + if (CHECK(!skb_pkt_end_skel, "skb_pkt_end_skel_load", "skb_pkt_end skeleton failed\n")) + goto cleanup; + + err = skb_pkt_end__attach(skb_pkt_end_skel); + if (CHECK(err, "skb_pkt_end_attach", "skb_pkt_end attach failed: %d\n", err)) + goto cleanup; + + if (sanity_run(skb_pkt_end_skel->progs.main_prog)) + goto cleanup; + +cleanup: + skb_pkt_end__destroy(skb_pkt_end_skel); +} diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c new file mode 100644 index 000000000000..c6520f21f5f5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <string.h> + +#include <linux/stddef.h> +#include <linux/bpf.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <sys/socket.h> +#include <netinet/tcp.h> +#include <linux/if.h> +#include <errno.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define SERV4_IP 0xc0a801feU /* 192.168.1.254 */ +#define SERV4_PORT 4040 +#define SERV4_REWRITE_IP 0x7f000001U /* 127.0.0.1 */ +#define SERV4_REWRITE_PORT 4444 + +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif + +static __inline int bind_to_device(struct bpf_sock_addr *ctx) +{ + char veth1[IFNAMSIZ] = "test_sock_addr1"; + char veth2[IFNAMSIZ] = "test_sock_addr2"; + char missing[IFNAMSIZ] = "nonexistent_dev"; + char del_bind[IFNAMSIZ] = ""; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth1, sizeof(veth1))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth2, sizeof(veth2))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &missing, sizeof(missing)) != -ENODEV) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &del_bind, sizeof(del_bind))) + return 1; + + return 0; +} + +SEC("cgroup/bind4") +int bind_v4_prog(struct bpf_sock_addr *ctx) +{ + struct bpf_sock *sk; + __u32 user_ip4; + __u16 user_port; + + sk = ctx->sk; + if (!sk) + return 0; + + if (sk->family != AF_INET) + return 0; + + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) + return 0; + + if (ctx->user_ip4 != bpf_htonl(SERV4_IP) || + ctx->user_port != bpf_htons(SERV4_PORT)) + return 0; + + // u8 narrow loads: + user_ip4 = 0; + user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[0] << 0; + user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[1] << 8; + user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[2] << 16; + user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[3] << 24; + if (ctx->user_ip4 != user_ip4) + return 0; + + user_port = 0; + user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0; + user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; + if (ctx->user_port != user_port) + return 0; + + // u16 narrow loads: + user_ip4 = 0; + user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[0] << 0; + user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[1] << 16; + if (ctx->user_ip4 != user_ip4) + return 0; + + /* Bind to device and unbind it. */ + if (bind_to_device(ctx)) + return 0; + + ctx->user_ip4 = bpf_htonl(SERV4_REWRITE_IP); + ctx->user_port = bpf_htons(SERV4_REWRITE_PORT); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c new file mode 100644 index 000000000000..4358e44dcf47 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <string.h> + +#include <linux/stddef.h> +#include <linux/bpf.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <sys/socket.h> +#include <netinet/tcp.h> +#include <linux/if.h> +#include <errno.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define SERV6_IP_0 0xfaceb00c /* face:b00c:1234:5678::abcd */ +#define SERV6_IP_1 0x12345678 +#define SERV6_IP_2 0x00000000 +#define SERV6_IP_3 0x0000abcd +#define SERV6_PORT 6060 +#define SERV6_REWRITE_IP_0 0x00000000 +#define SERV6_REWRITE_IP_1 0x00000000 +#define SERV6_REWRITE_IP_2 0x00000000 +#define SERV6_REWRITE_IP_3 0x00000001 +#define SERV6_REWRITE_PORT 6666 + +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif + +static __inline int bind_to_device(struct bpf_sock_addr *ctx) +{ + char veth1[IFNAMSIZ] = "test_sock_addr1"; + char veth2[IFNAMSIZ] = "test_sock_addr2"; + char missing[IFNAMSIZ] = "nonexistent_dev"; + char del_bind[IFNAMSIZ] = ""; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth1, sizeof(veth1))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth2, sizeof(veth2))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &missing, sizeof(missing)) != -ENODEV) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &del_bind, sizeof(del_bind))) + return 1; + + return 0; +} + +SEC("cgroup/bind6") +int bind_v6_prog(struct bpf_sock_addr *ctx) +{ + struct bpf_sock *sk; + __u32 user_ip6; + __u16 user_port; + int i; + + sk = ctx->sk; + if (!sk) + return 0; + + if (sk->family != AF_INET6) + return 0; + + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) + return 0; + + if (ctx->user_ip6[0] != bpf_htonl(SERV6_IP_0) || + ctx->user_ip6[1] != bpf_htonl(SERV6_IP_1) || + ctx->user_ip6[2] != bpf_htonl(SERV6_IP_2) || + ctx->user_ip6[3] != bpf_htonl(SERV6_IP_3) || + ctx->user_port != bpf_htons(SERV6_PORT)) + return 0; + + // u8 narrow loads: + for (i = 0; i < 4; i++) { + user_ip6 = 0; + user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[0] << 0; + user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[1] << 8; + user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[2] << 16; + user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[3] << 24; + if (ctx->user_ip6[i] != user_ip6) + return 0; + } + + user_port = 0; + user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0; + user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; + if (ctx->user_port != user_port) + return 0; + + // u16 narrow loads: + for (i = 0; i < 4; i++) { + user_ip6 = 0; + user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[0] << 0; + user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[1] << 16; + if (ctx->user_ip6[i] != user_ip6) + return 0; + } + + /* Bind to device and unbind it. */ + if (bind_to_device(ctx)) + return 0; + + ctx->user_ip6[0] = bpf_htonl(SERV6_REWRITE_IP_0); + ctx->user_ip6[1] = bpf_htonl(SERV6_REWRITE_IP_1); + ctx->user_ip6[2] = bpf_htonl(SERV6_REWRITE_IP_2); + ctx->user_ip6[3] = bpf_htonl(SERV6_REWRITE_IP_3); + ctx->user_port = bpf_htons(SERV6_REWRITE_PORT); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index 5a65f6b51377..95a5a0778ed7 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -368,6 +368,8 @@ PROG(IPV6FR)(struct __sk_buff *skb) */ if (!(keys->flags & BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) return export_flow_keys(keys, BPF_OK); + } else { + return export_flow_keys(keys, BPF_OK); } return parse_ipv6_proto(skb, fragh->nexthdr); diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c index 08651b23edba..b83b5d2e17dc 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c @@ -23,6 +23,6 @@ int dump_bpf_map(struct bpf_iter__bpf_map *ctx) BPF_SEQ_PRINTF(seq, "%8u %8ld %8ld %10lu\n", map->id, map->refcnt.counter, map->usercnt.counter, - map->memory.user->locked_vm.counter); + 0LLU); return 0; } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c new file mode 100644 index 000000000000..6cecab2b32ba --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Google LLC. */ +#include "bpf_iter.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} sk_stg_map SEC(".maps"); + +SEC("iter/bpf_sk_storage_map") +int delete_bpf_sk_storage_map(struct bpf_iter__bpf_sk_storage_map *ctx) +{ + if (ctx->sk) + bpf_sk_storage_delete(&sk_stg_map, ctx->sk); + + return 0; +} + +SEC("iter/task_file") +int fill_socket_owner(struct bpf_iter__task_file *ctx) +{ + struct task_struct *task = ctx->task; + struct file *file = ctx->file; + struct socket *sock; + int *sock_tgid; + + if (!task || !file) + return 0; + + sock = bpf_sock_from_file(file); + if (!sock) + return 0; + + sock_tgid = bpf_sk_storage_get(&sk_stg_map, sock->sk, 0, 0); + if (!sock_tgid) + return 0; + + *sock_tgid = task->tgid; + + return 0; +} + +SEC("iter/tcp") +int negate_socket_local_storage(struct bpf_iter__tcp *ctx) +{ + struct sock_common *sk_common = ctx->sk_common; + int *sock_tgid; + + if (!sk_common) + return 0; + + sock_tgid = bpf_sk_storage_get(&sk_stg_map, sk_common, 0, 0); + if (!sock_tgid) + return 0; + + *sock_tgid = -*sock_tgid; + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c index 4983087852a0..b7f32c160f4e 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c @@ -11,9 +11,10 @@ int dump_task(struct bpf_iter__task *ctx) { struct seq_file *seq = ctx->meta->seq; struct task_struct *task = ctx->task; + static char info[] = " === END ==="; if (task == (void *)0) { - BPF_SEQ_PRINTF(seq, " === END ===\n"); + BPF_SEQ_PRINTF(seq, "%s\n", info); return 0; } diff --git a/tools/testing/selftests/bpf/progs/bprm_opts.c b/tools/testing/selftests/bpf/progs/bprm_opts.c new file mode 100644 index 000000000000..5bfef2887e70 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bprm_opts.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2020 Google LLC. + */ + +#include "vmlinux.h" +#include <errno.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} secure_exec_task_map SEC(".maps"); + +SEC("lsm/bprm_creds_for_exec") +int BPF_PROG(secure_exec, struct linux_binprm *bprm) +{ + int *secureexec; + + secureexec = bpf_task_storage_get(&secure_exec_task_map, + bpf_get_current_task_btf(), 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + + if (secureexec && *secureexec) + bpf_bprm_opts_set(bprm, BPF_F_BPRM_SECUREEXEC); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index e6e616cb7bc9..9a2850850121 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -16,6 +16,23 @@ struct core_reloc_kernel_output { }; /* + * MODULE + */ + +struct core_reloc_module_output { + long long len; + long long off; + int read_ctx_sz; + bool read_ctx_exists; + bool buf_exists; + bool len_exists; + bool off_exists; + /* we have test_progs[-flavor], so cut flavor part */ + char comm[sizeof("test_progs")]; + int comm_len; +}; + +/* * FLAVORS */ struct core_reloc_flavors { diff --git a/tools/testing/selftests/bpf/progs/ima.c b/tools/testing/selftests/bpf/progs/ima.c new file mode 100644 index 000000000000..86b21aff4bc5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/ima.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2020 Google LLC. + */ + +#include "vmlinux.h" +#include <errno.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +long ima_hash_ret = -1; +u64 ima_hash = 0; +u32 monitored_pid = 0; + +char _license[] SEC("license") = "GPL"; + +SEC("lsm.s/bprm_committed_creds") +int BPF_PROG(ima, struct linux_binprm *bprm) +{ + u32 pid = bpf_get_current_pid_tgid() >> 32; + + if (pid == monitored_pid) + ima_hash_ret = bpf_ima_inode_hash(bprm->file->f_inode, + &ima_hash, sizeof(ima_hash)); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c index 0758ba229ae0..3e3de130f28f 100644 --- a/tools/testing/selftests/bpf/progs/local_storage.c +++ b/tools/testing/selftests/bpf/progs/local_storage.c @@ -4,9 +4,8 @@ * Copyright 2020 Google LLC. */ +#include "vmlinux.h" #include <errno.h> -#include <linux/bpf.h> -#include <stdbool.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> @@ -18,60 +17,68 @@ int monitored_pid = 0; int inode_storage_result = -1; int sk_storage_result = -1; -struct dummy_storage { +struct local_storage { + struct inode *exec_inode; __u32 value; + struct bpf_spin_lock lock; }; struct { __uint(type, BPF_MAP_TYPE_INODE_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC); __type(key, int); - __type(value, struct dummy_storage); + __type(value, struct local_storage); } inode_storage_map SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_SK_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE); __type(key, int); - __type(value, struct dummy_storage); + __type(value, struct local_storage); } sk_storage_map SEC(".maps"); -/* TODO Use vmlinux.h once BTF pruning for embedded types is fixed. - */ -struct sock {} __attribute__((preserve_access_index)); -struct sockaddr {} __attribute__((preserve_access_index)); -struct socket { - struct sock *sk; -} __attribute__((preserve_access_index)); - -struct inode {} __attribute__((preserve_access_index)); -struct dentry { - struct inode *d_inode; -} __attribute__((preserve_access_index)); -struct file { - struct inode *f_inode; -} __attribute__((preserve_access_index)); - +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct local_storage); +} task_storage_map SEC(".maps"); SEC("lsm/inode_unlink") int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim) { __u32 pid = bpf_get_current_pid_tgid() >> 32; - struct dummy_storage *storage; + struct local_storage *storage; + bool is_self_unlink; + int err; if (pid != monitored_pid) return 0; + storage = bpf_task_storage_get(&task_storage_map, + bpf_get_current_task_btf(), 0, 0); + if (storage) { + /* Don't let an executable delete itself */ + bpf_spin_lock(&storage->lock); + is_self_unlink = storage->exec_inode == victim->d_inode; + bpf_spin_unlock(&storage->lock); + if (is_self_unlink) + return -EPERM; + } + storage = bpf_inode_storage_get(&inode_storage_map, victim->d_inode, 0, - BPF_SK_STORAGE_GET_F_CREATE); + BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; - if (storage->value == DUMMY_STORAGE_VALUE) + bpf_spin_lock(&storage->lock); + if (storage->value != DUMMY_STORAGE_VALUE) inode_storage_result = -1; + bpf_spin_unlock(&storage->lock); - inode_storage_result = - bpf_inode_storage_delete(&inode_storage_map, victim->d_inode); + err = bpf_inode_storage_delete(&inode_storage_map, victim->d_inode); + if (!err) + inode_storage_result = err; return 0; } @@ -81,20 +88,26 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, int addrlen) { __u32 pid = bpf_get_current_pid_tgid() >> 32; - struct dummy_storage *storage; + struct local_storage *storage; + int err; if (pid != monitored_pid) return 0; storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0, - BPF_SK_STORAGE_GET_F_CREATE); + BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; - if (storage->value == DUMMY_STORAGE_VALUE) + bpf_spin_lock(&storage->lock); + if (storage->value != DUMMY_STORAGE_VALUE) sk_storage_result = -1; + bpf_spin_unlock(&storage->lock); + + err = bpf_sk_storage_delete(&sk_storage_map, sock->sk); + if (!err) + sk_storage_result = err; - sk_storage_result = bpf_sk_storage_delete(&sk_storage_map, sock->sk); return 0; } @@ -103,17 +116,19 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, int protocol, int kern) { __u32 pid = bpf_get_current_pid_tgid() >> 32; - struct dummy_storage *storage; + struct local_storage *storage; if (pid != monitored_pid) return 0; storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0, - BPF_SK_STORAGE_GET_F_CREATE); + BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; + bpf_spin_lock(&storage->lock); storage->value = DUMMY_STORAGE_VALUE; + bpf_spin_unlock(&storage->lock); return 0; } @@ -122,7 +137,7 @@ SEC("lsm/file_open") int BPF_PROG(file_open, struct file *file) { __u32 pid = bpf_get_current_pid_tgid() >> 32; - struct dummy_storage *storage; + struct local_storage *storage; if (pid != monitored_pid) return 0; @@ -131,10 +146,30 @@ int BPF_PROG(file_open, struct file *file) return 0; storage = bpf_inode_storage_get(&inode_storage_map, file->f_inode, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE); + BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; + bpf_spin_lock(&storage->lock); storage->value = DUMMY_STORAGE_VALUE; + bpf_spin_unlock(&storage->lock); return 0; } + +/* This uses the local storage to remember the inode of the binary that a + * process was originally executing. + */ +SEC("lsm/bprm_committed_creds") +void BPF_PROG(exec, struct linux_binprm *bprm) +{ + struct local_storage *storage; + + storage = bpf_task_storage_get(&task_storage_map, + bpf_get_current_task_btf(), 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (storage) { + bpf_spin_lock(&storage->lock); + storage->exec_inode = bprm->file->f_inode; + bpf_spin_unlock(&storage->lock); + } +} diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index c325405751e2..d8850bc6a9f1 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -26,17 +26,12 @@ __u32 g_line = 0; return 0; \ }) -struct bpf_map_memory { - __u32 pages; -} __attribute__((preserve_access_index)); - struct bpf_map { enum bpf_map_type map_type; __u32 key_size; __u32 value_size; __u32 max_entries; __u32 id; - struct bpf_map_memory memory; } __attribute__((preserve_access_index)); static inline int check_bpf_map_fields(struct bpf_map *map, __u32 key_size, @@ -47,7 +42,6 @@ static inline int check_bpf_map_fields(struct bpf_map *map, __u32 key_size, VERIFY(map->value_size == value_size); VERIFY(map->max_entries == max_entries); VERIFY(map->id > 0); - VERIFY(map->memory.pages > 0); return 1; } @@ -60,7 +54,6 @@ static inline int check_bpf_map_ptr(struct bpf_map *indirect, VERIFY(indirect->value_size == direct->value_size); VERIFY(indirect->max_entries == direct->max_entries); VERIFY(indirect->id == direct->id); - VERIFY(indirect->memory.pages == direct->memory.pages); return 1; } diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h index 30982a7e4d0f..4896fdf816f7 100644 --- a/tools/testing/selftests/bpf/progs/profiler.inc.h +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -256,6 +256,7 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, BPF_CORE_READ(task, nsproxy, cgroup_ns, root_cset, dfl_cgrp, kn); struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn); +#if __has_builtin(__builtin_preserve_enum_value) if (ENABLE_CGROUP_V1_RESOLVER && CONFIG_CGROUP_PIDS) { int cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id___local, pids_cgrp_id___local); @@ -275,6 +276,7 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, } } } +#endif cgroup_data->cgroup_root_inode = get_inode_from_kernfs(root_kernfs); cgroup_data->cgroup_proc_inode = get_inode_from_kernfs(proc_kernfs); diff --git a/tools/testing/selftests/bpf/progs/skb_pkt_end.c b/tools/testing/selftests/bpf/progs/skb_pkt_end.c new file mode 100644 index 000000000000..cf6823f42e80 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/skb_pkt_end.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +#define BPF_NO_PRESERVE_ACCESS_INDEX +#include <vmlinux.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_helpers.h> + +#define NULL 0 +#define INLINE __always_inline + +#define skb_shorter(skb, len) ((void *)(long)(skb)->data + (len) > (void *)(long)skb->data_end) + +#define ETH_IPV4_TCP_SIZE (14 + sizeof(struct iphdr) + sizeof(struct tcphdr)) + +static INLINE struct iphdr *get_iphdr(struct __sk_buff *skb) +{ + struct iphdr *ip = NULL; + struct ethhdr *eth; + + if (skb_shorter(skb, ETH_IPV4_TCP_SIZE)) + goto out; + + eth = (void *)(long)skb->data; + ip = (void *)(eth + 1); + +out: + return ip; +} + +SEC("classifier/cls") +int main_prog(struct __sk_buff *skb) +{ + struct iphdr *ip = NULL; + struct tcphdr *tcp; + __u8 proto = 0; + + if (!(ip = get_iphdr(skb))) + goto out; + + proto = ip->protocol; + + if (proto != IPPROTO_TCP) + goto out; + + tcp = (void*)(ip + 1); + if (tcp->dest != 0) + goto out; + if (!tcp) + goto out; + + return tcp->urg_ptr; +out: + return -1; +} +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_module.c b/tools/testing/selftests/bpf/progs/test_core_reloc_module.c new file mode 100644 index 000000000000..f59f175c7baf --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_module.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct bpf_testmod_test_read_ctx { + /* field order is mixed up */ + size_t len; + char *buf; + loff_t off; +} __attribute__((preserve_access_index)); + +struct { + char in[256]; + char out[256]; + bool skip; + uint64_t my_pid_tgid; +} data = {}; + +struct core_reloc_module_output { + long long len; + long long off; + int read_ctx_sz; + bool read_ctx_exists; + bool buf_exists; + bool len_exists; + bool off_exists; + /* we have test_progs[-flavor], so cut flavor part */ + char comm[sizeof("test_progs")]; + int comm_len; +}; + +SEC("raw_tp/bpf_testmod_test_read") +int BPF_PROG(test_core_module_probed, + struct task_struct *task, + struct bpf_testmod_test_read_ctx *read_ctx) +{ +#if __has_builtin(__builtin_preserve_enum_value) + struct core_reloc_module_output *out = (void *)&data.out; + __u64 pid_tgid = bpf_get_current_pid_tgid(); + __u32 real_tgid = (__u32)(pid_tgid >> 32); + __u32 real_pid = (__u32)pid_tgid; + + if (data.my_pid_tgid != pid_tgid) + return 0; + + if (BPF_CORE_READ(task, pid) != real_pid || BPF_CORE_READ(task, tgid) != real_tgid) + return 0; + + out->len = BPF_CORE_READ(read_ctx, len); + out->off = BPF_CORE_READ(read_ctx, off); + + out->read_ctx_sz = bpf_core_type_size(struct bpf_testmod_test_read_ctx); + out->read_ctx_exists = bpf_core_type_exists(struct bpf_testmod_test_read_ctx); + out->buf_exists = bpf_core_field_exists(read_ctx->buf); + out->off_exists = bpf_core_field_exists(read_ctx->off); + out->len_exists = bpf_core_field_exists(read_ctx->len); + + out->comm_len = BPF_CORE_READ_STR_INTO(&out->comm, task, comm); +#else + data.skip = true; +#endif + + return 0; +} + +SEC("tp_btf/bpf_testmod_test_read") +int BPF_PROG(test_core_module_direct, + struct task_struct *task, + struct bpf_testmod_test_read_ctx *read_ctx) +{ +#if __has_builtin(__builtin_preserve_enum_value) + struct core_reloc_module_output *out = (void *)&data.out; + __u64 pid_tgid = bpf_get_current_pid_tgid(); + __u32 real_tgid = (__u32)(pid_tgid >> 32); + __u32 real_pid = (__u32)pid_tgid; + + if (data.my_pid_tgid != pid_tgid) + return 0; + + if (task->pid != real_pid || task->tgid != real_tgid) + return 0; + + out->len = read_ctx->len; + out->off = read_ctx->off; + + out->read_ctx_sz = bpf_core_type_size(struct bpf_testmod_test_read_ctx); + out->read_ctx_exists = bpf_core_type_exists(struct bpf_testmod_test_read_ctx); + out->buf_exists = bpf_core_field_exists(read_ctx->buf); + out->off_exists = bpf_core_field_exists(read_ctx->off); + out->len_exists = bpf_core_field_exists(read_ctx->len); + + out->comm_len = BPF_CORE_READ_STR_INTO(&out->comm, task, comm); +#else + data.skip = true; +#endif + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_hash_large_key.c b/tools/testing/selftests/bpf/progs/test_hash_large_key.c new file mode 100644 index 000000000000..473a22794a62 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_hash_large_key.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 2); + __type(key, struct bigelement); + __type(value, __u32); +} hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct bigelement); +} key_map SEC(".maps"); + +struct bigelement { + int a; + char b[4096]; + long long c; +}; + +SEC("raw_tracepoint/sys_enter") +int bpf_hash_large_key_test(void *ctx) +{ + int zero = 0, err = 1, value = 42; + struct bigelement *key; + + key = bpf_map_lookup_elem(&key_map, &zero); + if (!key) + return 0; + + key->c = 1; + if (bpf_map_update_elem(&hash_map, key, &value, BPF_ANY)) + return 0; + + return 0; +} + diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c new file mode 100644 index 000000000000..efd1e287ac17 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_module_attach.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include "../bpf_testmod/bpf_testmod.h" + +__u32 raw_tp_read_sz = 0; + +SEC("raw_tp/bpf_testmod_test_read") +int BPF_PROG(handle_raw_tp, + struct task_struct *task, struct bpf_testmod_test_read_ctx *read_ctx) +{ + raw_tp_read_sz = BPF_CORE_READ(read_ctx, len); + return 0; +} + +__u32 tp_btf_read_sz = 0; + +SEC("tp_btf/bpf_testmod_test_read") +int BPF_PROG(handle_tp_btf, + struct task_struct *task, struct bpf_testmod_test_read_ctx *read_ctx) +{ + tp_btf_read_sz = read_ctx->len; + return 0; +} + +__u32 fentry_read_sz = 0; + +SEC("fentry/bpf_testmod_test_read") +int BPF_PROG(handle_fentry, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fentry_read_sz = len; + return 0; +} + +__u32 fentry_manual_read_sz = 0; + +SEC("fentry/placeholder") +int BPF_PROG(handle_fentry_manual, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fentry_manual_read_sz = len; + return 0; +} + +__u32 fexit_read_sz = 0; +int fexit_ret = 0; + +SEC("fexit/bpf_testmod_test_read") +int BPF_PROG(handle_fexit, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len, + int ret) +{ + fexit_read_sz = len; + fexit_ret = ret; + return 0; +} + +__u32 fmod_ret_read_sz = 0; + +SEC("fmod_ret/bpf_testmod_test_read") +int BPF_PROG(handle_fmod_ret, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fmod_ret_read_sz = len; + return 0; /* don't override the exit code */ +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sk_storage_trace_itself.c b/tools/testing/selftests/bpf/progs/test_sk_storage_trace_itself.c new file mode 100644 index 000000000000..59ef72d02a61 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sk_storage_trace_itself.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} sk_stg_map SEC(".maps"); + +SEC("fentry/bpf_sk_storage_free") +int BPF_PROG(trace_bpf_sk_storage_free, struct sock *sk) +{ + int *value; + + value = bpf_sk_storage_get(&sk_stg_map, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + + if (value) + *value = 1; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c new file mode 100644 index 000000000000..8e94e5c080aa --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_helpers.h> + +struct sk_stg { + __u32 pid; + __u32 last_notclose_state; + char comm[16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct sk_stg); +} sk_stg_map SEC(".maps"); + +/* Testing delete */ +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} del_sk_stg_map SEC(".maps"); + +char task_comm[16] = ""; + +SEC("tp_btf/inet_sock_set_state") +int BPF_PROG(trace_inet_sock_set_state, struct sock *sk, int oldstate, + int newstate) +{ + struct sk_stg *stg; + + if (newstate == BPF_TCP_CLOSE) + return 0; + + stg = bpf_sk_storage_get(&sk_stg_map, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!stg) + return 0; + + stg->last_notclose_state = newstate; + + bpf_sk_storage_delete(&del_sk_stg_map, sk); + + return 0; +} + +static void set_task_info(struct sock *sk) +{ + struct task_struct *task; + struct sk_stg *stg; + + stg = bpf_sk_storage_get(&sk_stg_map, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!stg) + return; + + stg->pid = bpf_get_current_pid_tgid(); + + task = (struct task_struct *)bpf_get_current_task(); + bpf_core_read_str(&stg->comm, sizeof(stg->comm), &task->comm); + bpf_core_read_str(&task_comm, sizeof(task_comm), &task->comm); +} + +SEC("fentry/inet_csk_listen_start") +int BPF_PROG(trace_inet_csk_listen_start, struct sock *sk, int backlog) +{ + set_task_info(sk); + + return 0; +} + +SEC("fentry/tcp_connect") +int BPF_PROG(trace_tcp_connect, struct sock *sk) +{ + set_task_info(sk); + + return 0; +} + +SEC("fexit/inet_csk_accept") +int BPF_PROG(inet_csk_accept, struct sock *sk, int flags, int *err, bool kern, + struct sock *accepted_sk) +{ + set_task_info(accepted_sk); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c index 3e6912e4df3d..94f50f7e94d6 100644 --- a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c @@ -12,50 +12,41 @@ #include <linux/tcp.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> +#include "bpf_tcp_helpers.h" #include "test_tcpbpf.h" -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 4); - __type(key, __u32); - __type(value, struct tcpbpf_globals); -} global_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 2); - __type(key, __u32); - __type(value, int); -} sockopt_results SEC(".maps"); - -static inline void update_event_map(int event) +struct tcpbpf_globals global = {}; +int _version SEC("version") = 1; + +/** + * SOL_TCP is defined in <netinet/tcp.h> while + * TCP_SAVED_SYN is defined in already included <linux/tcp.h> + */ +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + +static __always_inline int get_tp_window_clamp(struct bpf_sock_ops *skops) { - __u32 key = 0; - struct tcpbpf_globals g, *gp; - - gp = bpf_map_lookup_elem(&global_map, &key); - if (gp == NULL) { - struct tcpbpf_globals g = {0}; - - g.event_map |= (1 << event); - bpf_map_update_elem(&global_map, &key, &g, - BPF_ANY); - } else { - g = *gp; - g.event_map |= (1 << event); - bpf_map_update_elem(&global_map, &key, &g, - BPF_ANY); - } + struct bpf_sock *sk; + struct tcp_sock *tp; + + sk = skops->sk; + if (!sk) + return -1; + tp = bpf_skc_to_tcp_sock(sk); + if (!tp) + return -1; + return tp->window_clamp; } -int _version SEC("version") = 1; - SEC("sockops") int bpf_testcb(struct bpf_sock_ops *skops) { char header[sizeof(struct ipv6hdr) + sizeof(struct tcphdr)]; struct bpf_sock_ops *reuse = skops; struct tcphdr *thdr; + int window_clamp = 9216; int good_call_rv = 0; int bad_call_rv = 0; int save_syn = 1; @@ -105,29 +96,20 @@ int bpf_testcb(struct bpf_sock_ops *skops) op = (int) skops->op; - update_event_map(op); + global.event_map |= (1 << op); switch (op) { + case BPF_SOCK_OPS_TCP_CONNECT_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_WINDOW_CLAMP, + &window_clamp, sizeof(window_clamp)); + global.window_clamp_client = get_tp_window_clamp(skops); + break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: /* Test failure to set largest cb flag (assumes not defined) */ - bad_call_rv = bpf_sock_ops_cb_flags_set(skops, 0x80); + global.bad_cb_test_rv = bpf_sock_ops_cb_flags_set(skops, 0x80); /* Set callback */ - good_call_rv = bpf_sock_ops_cb_flags_set(skops, + global.good_cb_test_rv = bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG); - /* Update results */ - { - __u32 key = 0; - struct tcpbpf_globals g, *gp; - - gp = bpf_map_lookup_elem(&global_map, &key); - if (!gp) - break; - g = *gp; - g.bad_cb_test_rv = bad_call_rv; - g.good_cb_test_rv = good_call_rv; - bpf_map_update_elem(&global_map, &key, &g, - BPF_ANY); - } break; case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: skops->sk_txhash = 0x12345f; @@ -143,12 +125,14 @@ int bpf_testcb(struct bpf_sock_ops *skops) thdr = (struct tcphdr *)(header + offset); v = thdr->syn; - __u32 key = 1; - bpf_map_update_elem(&sockopt_results, &key, &v, - BPF_ANY); + global.tcp_saved_syn = v; } } + rv = bpf_setsockopt(skops, SOL_TCP, TCP_WINDOW_CLAMP, + &window_clamp, sizeof(window_clamp)); + + global.window_clamp_server = get_tp_window_clamp(skops); break; case BPF_SOCK_OPS_RTO_CB: break; @@ -156,25 +140,16 @@ int bpf_testcb(struct bpf_sock_ops *skops) break; case BPF_SOCK_OPS_STATE_CB: if (skops->args[1] == BPF_TCP_CLOSE) { - __u32 key = 0; - struct tcpbpf_globals g, *gp; - - gp = bpf_map_lookup_elem(&global_map, &key); - if (!gp) - break; - g = *gp; if (skops->args[0] == BPF_TCP_LISTEN) { - g.num_listen++; + global.num_listen++; } else { - g.total_retrans = skops->total_retrans; - g.data_segs_in = skops->data_segs_in; - g.data_segs_out = skops->data_segs_out; - g.bytes_received = skops->bytes_received; - g.bytes_acked = skops->bytes_acked; + global.total_retrans = skops->total_retrans; + global.data_segs_in = skops->data_segs_in; + global.data_segs_out = skops->data_segs_out; + global.bytes_received = skops->bytes_received; + global.bytes_acked = skops->bytes_acked; } - g.num_close_events++; - bpf_map_update_elem(&global_map, &key, &g, - BPF_ANY); + global.num_close_events++; } break; case BPF_SOCK_OPS_TCP_LISTEN_CB: @@ -182,9 +157,7 @@ int bpf_testcb(struct bpf_sock_ops *skops) v = bpf_setsockopt(skops, IPPROTO_TCP, TCP_SAVE_SYN, &save_syn, sizeof(save_syn)); /* Update global map w/ result of setsock opt */ - __u32 key = 0; - - bpf_map_update_elem(&sockopt_results, &key, &v, BPF_ANY); + global.tcp_save_syn = v; break; default: rv = -1; diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index f48dbfe24ddc..a621b58ab079 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -15,7 +15,6 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/types.h> -#include <linux/tcp.h> #include <linux/socket.h> #include <linux/pkt_cls.h> #include <linux/erspan.h> @@ -528,12 +527,11 @@ int _ipip_set_tunnel(struct __sk_buff *skb) struct bpf_tunnel_key key = {}; void *data = (void *)(long)skb->data; struct iphdr *iph = data; - struct tcphdr *tcp = data + sizeof(*iph); void *data_end = (void *)(long)skb->data_end; int ret; /* single length check */ - if (data + sizeof(*iph) + sizeof(*tcp) > data_end) { + if (data + sizeof(*iph) > data_end) { ERROR(1); return TC_ACT_SHOT; } @@ -541,16 +539,6 @@ int _ipip_set_tunnel(struct __sk_buff *skb) key.tunnel_ttl = 64; if (iph->protocol == IPPROTO_ICMP) { key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ - } else { - if (iph->protocol != IPPROTO_TCP || iph->ihl != 5) - return TC_ACT_SHOT; - - if (tcp->dest == bpf_htons(5200)) - key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ - else if (tcp->dest == bpf_htons(5201)) - key.remote_ipv4 = 0xac100165; /* 172.16.1.101 */ - else - return TC_ACT_SHOT; } ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); @@ -585,19 +573,20 @@ int _ipip6_set_tunnel(struct __sk_buff *skb) struct bpf_tunnel_key key = {}; void *data = (void *)(long)skb->data; struct iphdr *iph = data; - struct tcphdr *tcp = data + sizeof(*iph); void *data_end = (void *)(long)skb->data_end; int ret; /* single length check */ - if (data + sizeof(*iph) + sizeof(*tcp) > data_end) { + if (data + sizeof(*iph) > data_end) { ERROR(1); return TC_ACT_SHOT; } __builtin_memset(&key, 0x0, sizeof(key)); - key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */ key.tunnel_ttl = 64; + if (iph->protocol == IPPROTO_ICMP) { + key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */ + } ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6); @@ -634,35 +623,18 @@ int _ip6ip6_set_tunnel(struct __sk_buff *skb) struct bpf_tunnel_key key = {}; void *data = (void *)(long)skb->data; struct ipv6hdr *iph = data; - struct tcphdr *tcp = data + sizeof(*iph); void *data_end = (void *)(long)skb->data_end; int ret; /* single length check */ - if (data + sizeof(*iph) + sizeof(*tcp) > data_end) { + if (data + sizeof(*iph) > data_end) { ERROR(1); return TC_ACT_SHOT; } - key.remote_ipv6[0] = bpf_htonl(0x2401db00); key.tunnel_ttl = 64; - if (iph->nexthdr == 58 /* NEXTHDR_ICMP */) { - key.remote_ipv6[3] = bpf_htonl(1); - } else { - if (iph->nexthdr != 6 /* NEXTHDR_TCP */) { - ERROR(iph->nexthdr); - return TC_ACT_SHOT; - } - - if (tcp->dest == bpf_htons(5200)) { - key.remote_ipv6[3] = bpf_htonl(1); - } else if (tcp->dest == bpf_htons(5201)) { - key.remote_ipv6[3] = bpf_htonl(2); - } else { - ERROR(tcp->dest); - return TC_ACT_SHOT; - } + key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */ } ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), diff --git a/tools/testing/selftests/bpf/tcp_client.py b/tools/testing/selftests/bpf/tcp_client.py deleted file mode 100755 index bfff82be3fc1..000000000000 --- a/tools/testing/selftests/bpf/tcp_client.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# -# SPDX-License-Identifier: GPL-2.0 -# - -import sys, os, os.path, getopt -import socket, time -import subprocess -import select - -def read(sock, n): - buf = b'' - while len(buf) < n: - rem = n - len(buf) - try: s = sock.recv(rem) - except (socket.error) as e: return b'' - buf += s - return buf - -def send(sock, s): - total = len(s) - count = 0 - while count < total: - try: n = sock.send(s) - except (socket.error) as e: n = 0 - if n == 0: - return count; - count += n - return count - - -serverPort = int(sys.argv[1]) - -# create active socket -sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) -try: - sock.connect(('::1', serverPort)) -except socket.error as e: - sys.exit(1) - -buf = b'' -n = 0 -while n < 1000: - buf += b'+' - n += 1 - -sock.settimeout(1); -n = send(sock, buf) -n = read(sock, 500) -sys.exit(0) diff --git a/tools/testing/selftests/bpf/tcp_server.py b/tools/testing/selftests/bpf/tcp_server.py deleted file mode 100755 index 42ab8882f00f..000000000000 --- a/tools/testing/selftests/bpf/tcp_server.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python3 -# -# SPDX-License-Identifier: GPL-2.0 -# - -import sys, os, os.path, getopt -import socket, time -import subprocess -import select - -def read(sock, n): - buf = b'' - while len(buf) < n: - rem = n - len(buf) - try: s = sock.recv(rem) - except (socket.error) as e: return b'' - buf += s - return buf - -def send(sock, s): - total = len(s) - count = 0 - while count < total: - try: n = sock.send(s) - except (socket.error) as e: n = 0 - if n == 0: - return count; - count += n - return count - - -SERVER_PORT = 12877 -MAX_PORTS = 2 - -serverPort = SERVER_PORT -serverSocket = None - -# create passive socket -serverSocket = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) - -try: serverSocket.bind(('::1', 0)) -except socket.error as msg: - print('bind fails: ' + str(msg)) - -sn = serverSocket.getsockname() -serverPort = sn[1] - -cmdStr = ("./tcp_client.py %d &") % (serverPort) -os.system(cmdStr) - -buf = b'' -n = 0 -while n < 500: - buf += b'.' - n += 1 - -serverSocket.listen(MAX_PORTS) -readList = [serverSocket] - -while True: - readyRead, readyWrite, inError = \ - select.select(readList, [], [], 2) - - if len(readyRead) > 0: - waitCount = 0 - for sock in readyRead: - if sock == serverSocket: - (clientSocket, address) = serverSocket.accept() - address = str(address[0]) - readList.append(clientSocket) - else: - sock.settimeout(1); - s = read(sock, 1000) - n = send(sock, buf) - sock.close() - serverSocket.close() - sys.exit(0) - else: - print('Select timeout!') - sys.exit(1) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 0d92ebcb335d..0ad3e6305ff0 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1223,9 +1223,10 @@ out_map_in_map: static void test_map_large(void) { + struct bigkey { int a; - char b[116]; + char b[4096]; long long c; } key; int fd, i, value; diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index 43c9cda199b8..b99bb8ed3ed4 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -184,9 +184,7 @@ def bpftool_prog_list(expected=None, ns=""): def bpftool_map_list(expected=None, ns=""): _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) # Remove the base maps - for m in base_maps: - if m in maps: - maps.remove(m) + maps = [m for m in maps if m not in base_maps and m.get('name') not in base_map_names] if expected is not None: if len(maps) != expected: fail(True, "%d BPF maps loaded, expected %d" % @@ -716,13 +714,11 @@ def test_multi_prog(simdev, sim, obj, modename, modeid): fail(ret == 0, "Replaced one of programs without -force") check_extack(err, "XDP program already attached.", args) - if modename == "" or modename == "drv": - othermode = "" if modename == "drv" else "drv" - start_test("Test multi-attachment XDP - detach...") - ret, _, err = sim.unset_xdp(othermode, force=True, - fail=False, include_stderr=True) - fail(ret == 0, "Removed program with a bad mode") - check_extack(err, "program loaded with different flags.", args) + start_test("Test multi-attachment XDP - remove without mode...") + ret, _, err = sim.unset_xdp("", force=True, + fail=False, include_stderr=True) + fail(ret == 0, "Removed program without a mode flag") + check_extack(err, "More than one program loaded, unset mode is ambiguous.", args) sim.unset_xdp("offload") xdp = sim.ip_link_show(xdp=True)["xdp"] @@ -772,6 +768,9 @@ ret, progs = bpftool("prog", fail=False) skip(ret != 0, "bpftool not installed") base_progs = progs _, base_maps = bpftool("map") +base_map_names = [ + 'pid_iter.rodata' # created on each bpftool invocation +] # Check netdevsim ret, out = cmd("modprobe netdevsim", fail=False) @@ -913,11 +912,18 @@ try: sim.tc_flush_filters() + start_test("Test TC offloads failure...") + sim.dfs["dev/bpf_bind_verifier_accept"] = 0 + ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "TC filter did not reject with TC offloads enabled") + check_verifier_log(err, "[netdevsim] Hello from netdevsim!") + sim.dfs["dev/bpf_bind_verifier_accept"] = 1 + start_test("Test TC offloads work...") ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, fail=False, include_stderr=True) fail(ret != 0, "TC filter did not load with TC offloads enabled") - check_verifier_log(err, "[netdevsim] Hello from netdevsim!") start_test("Test TC offload basics...") dfs = simdev.dfs_get_bound_progs(expected=1) @@ -941,6 +947,7 @@ try: start_test("Test disabling TC offloads is rejected while filters installed...") ret, _ = sim.set_ethtool_tc_offloads(False, fail=False) fail(ret == 0, "Driver should refuse to disable TC offloads with filters installed...") + sim.set_ethtool_tc_offloads(True) start_test("Test qdisc removal frees things...") sim.tc_flush_filters() @@ -999,18 +1006,8 @@ try: fail=False, include_stderr=True) fail(ret == 0, "Replaced XDP program with a program in different mode") check_extack(err, - "native and generic XDP can't be active at the same time.", + "Native and generic XDP can't be active at the same time.", args) - ret, _, err = sim.set_xdp(obj, "", force=True, - fail=False, include_stderr=True) - fail(ret == 0, "Replaced XDP program with a program in different mode") - check_extack(err, "program loaded with different flags.", args) - - start_test("Test XDP prog remove with bad flags...") - ret, _, err = sim.unset_xdp("", force=True, - fail=False, include_stderr=True) - fail(ret == 0, "Removed program with a bad mode") - check_extack(err, "program loaded with different flags.", args) start_test("Test MTU restrictions...") ret, _ = sim.set_mtu(9000, fail=False) @@ -1040,10 +1037,19 @@ try: offload = bpf_pinned("/sys/fs/bpf/offload") ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True) fail(ret == 0, "attached offloaded XDP program to drv") - check_extack(err, "using device-bound program without HW_MODE flag is not supported.", args) + check_extack(err, "Using device-bound program without HW_MODE flag is not supported.", args) rm("/sys/fs/bpf/offload") sim.wait_for_flush() + start_test("Test XDP load failure...") + sim.dfs["dev/bpf_bind_verifier_accept"] = 0 + ret, _, err = bpftool_prog_load("sample_ret0.o", "/sys/fs/bpf/offload", + dev=sim['ifname'], fail=False, include_stderr=True) + fail(ret == 0, "verifier should fail on load") + check_verifier_log(err, "[netdevsim] Hello from netdevsim!") + sim.dfs["dev/bpf_bind_verifier_accept"] = 1 + sim.wait_for_flush() + start_test("Test XDP offload...") _, _, err = sim.set_xdp(obj, "offload", verbose=True, include_stderr=True) ipl = sim.ip_link_show(xdp=True) @@ -1051,7 +1057,6 @@ try: progs = bpftool_prog_list(expected=1) prog = progs[0] fail(link_xdp["id"] != prog["id"], "Loaded program has wrong ID") - check_verifier_log(err, "[netdevsim] Hello from netdevsim!") start_test("Test XDP offload is device bound...") dfs = simdev.dfs_get_bound_progs(expected=1) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 22943b58d752..7d077d48cadd 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -149,15 +149,15 @@ void test__end_subtest() if (sub_error_cnt) env.fail_cnt++; - else + else if (test->skip_cnt == 0) env.sub_succ_cnt++; skip_account(); dump_test_log(test, sub_error_cnt); fprintf(env.stdout, "#%d/%d %s:%s\n", - test->test_num, test->subtest_num, - test->subtest_name, sub_error_cnt ? "FAIL" : "OK"); + test->test_num, test->subtest_num, test->subtest_name, + sub_error_cnt ? "FAIL" : (test->skip_cnt ? "SKIP" : "OK")); free(test->subtest_name); test->subtest_name = NULL; @@ -360,6 +360,58 @@ err: return -1; } +static int finit_module(int fd, const char *param_values, int flags) +{ + return syscall(__NR_finit_module, fd, param_values, flags); +} + +static int delete_module(const char *name, int flags) +{ + return syscall(__NR_delete_module, name, flags); +} + +static void unload_bpf_testmod(void) +{ + if (delete_module("bpf_testmod", 0)) { + if (errno == ENOENT) { + if (env.verbosity > VERBOSE_NONE) + fprintf(stdout, "bpf_testmod.ko is already unloaded.\n"); + return; + } + fprintf(env.stderr, "Failed to unload bpf_testmod.ko from kernel: %d\n", -errno); + exit(1); + } + if (env.verbosity > VERBOSE_NONE) + fprintf(stdout, "Successfully unloaded bpf_testmod.ko.\n"); +} + +static int load_bpf_testmod(void) +{ + int fd; + + /* ensure previous instance of the module is unloaded */ + unload_bpf_testmod(); + + if (env.verbosity > VERBOSE_NONE) + fprintf(stdout, "Loading bpf_testmod.ko...\n"); + + fd = open("bpf_testmod.ko", O_RDONLY); + if (fd < 0) { + fprintf(env.stderr, "Can't find bpf_testmod.ko kernel module: %d\n", -errno); + return -ENOENT; + } + if (finit_module(fd, "", 0)) { + fprintf(env.stderr, "Failed to load bpf_testmod.ko into the kernel: %d\n", -errno); + close(fd); + return -EINVAL; + } + close(fd); + + if (env.verbosity > VERBOSE_NONE) + fprintf(stdout, "Successfully loaded bpf_testmod.ko.\n"); + return 0; +} + /* extern declarations for test funcs */ #define DEFINE_TEST(name) extern void test_##name(void); #include <prog_tests/tests.h> @@ -535,6 +587,16 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) return -EINVAL; } } + + if (env->verbosity > VERBOSE_NONE) { + if (setenv("SELFTESTS_VERBOSE", "1", 1) == -1) { + fprintf(stderr, + "Unable to setenv SELFTESTS_VERBOSE=1 (errno=%d)", + errno); + return -1; + } + } + break; case ARG_GET_TEST_CNT: env->get_test_cnt = true; @@ -678,6 +740,11 @@ int main(int argc, char **argv) save_netns(); stdio_hijack(); + env.has_testmod = true; + if (load_bpf_testmod()) { + fprintf(env.stderr, "WARNING! Selftests relying on bpf_testmod.ko will be skipped.\n"); + env.has_testmod = false; + } for (i = 0; i < prog_test_cnt; i++) { struct prog_test_def *test = &prog_test_defs[i]; @@ -722,6 +789,8 @@ int main(int argc, char **argv) if (test->need_cgroup_cleanup) cleanup_cgroup_environment(); } + if (env.has_testmod) + unload_bpf_testmod(); stdio_restore(); if (env.get_test_cnt) { diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 238f5f61189e..115953243f62 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -66,6 +66,7 @@ struct test_env { enum verbosity verbosity; bool jit_enabled; + bool has_testmod; bool get_test_cnt; bool list_test_names; @@ -141,6 +142,17 @@ extern int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_NEQ(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act != ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld == expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + #define ASSERT_STREQ(actual, expected, name) ({ \ static int duration = 0; \ const char *___act = actual; \ diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index b8c72c1d9cf7..dcb83ab02919 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -31,6 +31,8 @@ #define CONNECT6_PROG_PATH "./connect6_prog.o" #define SENDMSG4_PROG_PATH "./sendmsg4_prog.o" #define SENDMSG6_PROG_PATH "./sendmsg6_prog.o" +#define BIND4_PROG_PATH "./bind4_prog.o" +#define BIND6_PROG_PATH "./bind6_prog.o" #define SERV4_IP "192.168.1.254" #define SERV4_REWRITE_IP "127.0.0.1" @@ -660,190 +662,6 @@ static int load_insns(const struct sock_addr_test *test, return ret; } -/* [1] These testing programs try to read different context fields, including - * narrow loads of different sizes from user_ip4 and user_ip6, and write to - * those allowed to be overridden. - * - * [2] BPF_LD_IMM64 & BPF_JMP_REG are used below whenever there is a need to - * compare a register with unsigned 32bit integer. BPF_JMP_IMM can't be used - * in such cases since it accepts only _signed_ 32bit integer as IMM - * argument. Also note that BPF_LD_IMM64 contains 2 instructions what matters - * to count jumps properly. - */ - -static int bind4_prog_load(const struct sock_addr_test *test) -{ - union { - uint8_t u4_addr8[4]; - uint16_t u4_addr16[2]; - uint32_t u4_addr32; - } ip4, port; - struct sockaddr_in addr4_rw; - - if (inet_pton(AF_INET, SERV4_IP, (void *)&ip4) != 1) { - log_err("Invalid IPv4: %s", SERV4_IP); - return -1; - } - - port.u4_addr32 = htons(SERV4_PORT); - - if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, - (struct sockaddr *)&addr4_rw, sizeof(addr4_rw)) == -1) - return -1; - - /* See [1]. */ - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET && */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 32), - - /* (sk.type == SOCK_DGRAM || sk.type == SOCK_STREAM) && */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, type)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 1), - BPF_JMP_A(1), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_STREAM, 28), - - /* 1st_byte_of_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 26), - - /* 2nd_byte_of_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4) + 1), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[1], 24), - - /* 3rd_byte_of_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4) + 2), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[2], 22), - - /* 4th_byte_of_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4) + 3), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[3], 20), - - /* 1st_half_of_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[0], 18), - - /* 2nd_half_of_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4) + 2), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[1], 16), - - /* whole_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4)), - BPF_LD_IMM64(BPF_REG_8, ip4.u4_addr32), /* See [2]. */ - BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 12), - - /* 1st_byte_of_user_port == expected && */ - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_port)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr8[0], 10), - - /* 1st_half_of_user_port == expected && */ - BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_port)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr16[0], 8), - - /* user_port == expected) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_port)), - BPF_LD_IMM64(BPF_REG_8, port.u4_addr32), /* See [2]. */ - BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 4), - - /* user_ip4 = addr4_rw.sin_addr */ - BPF_MOV32_IMM(BPF_REG_7, addr4_rw.sin_addr.s_addr), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_ip4)), - - /* user_port = addr4_rw.sin_port */ - BPF_MOV32_IMM(BPF_REG_7, addr4_rw.sin_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn)); -} - -static int bind6_prog_load(const struct sock_addr_test *test) -{ - struct sockaddr_in6 addr6_rw; - struct in6_addr ip6; - - if (inet_pton(AF_INET6, SERV6_IP, (void *)&ip6) != 1) { - log_err("Invalid IPv6: %s", SERV6_IP); - return -1; - } - - if (mk_sockaddr(AF_INET6, SERV6_REWRITE_IP, SERV6_REWRITE_PORT, - (struct sockaddr *)&addr6_rw, sizeof(addr6_rw)) == -1) - return -1; - - /* See [1]. */ - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET6 && */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 18), - - /* 5th_byte_of_user_ip6 == expected && */ - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip6[1])), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip6.s6_addr[4], 16), - - /* 3rd_half_of_user_ip6 == expected && */ - BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip6[1])), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip6.s6_addr16[2], 14), - - /* last_word_of_user_ip6 == expected) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip6[3])), - BPF_LD_IMM64(BPF_REG_8, ip6.s6_addr32[3]), /* See [2]. */ - BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 10), - - -#define STORE_IPV6_WORD(N) \ - BPF_MOV32_IMM(BPF_REG_7, addr6_rw.sin6_addr.s6_addr32[N]), \ - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, \ - offsetof(struct bpf_sock_addr, user_ip6[N])) - - /* user_ip6 = addr6_rw.sin6_addr */ - STORE_IPV6_WORD(0), - STORE_IPV6_WORD(1), - STORE_IPV6_WORD(2), - STORE_IPV6_WORD(3), - - /* user_port = addr6_rw.sin6_port */ - BPF_MOV32_IMM(BPF_REG_7, addr6_rw.sin6_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn)); -} - static int load_path(const struct sock_addr_test *test, const char *path) { struct bpf_prog_load_attr attr; @@ -865,6 +683,16 @@ static int load_path(const struct sock_addr_test *test, const char *path) return prog_fd; } +static int bind4_prog_load(const struct sock_addr_test *test) +{ + return load_path(test, BIND4_PROG_PATH); +} + +static int bind6_prog_load(const struct sock_addr_test *test) +{ + return load_path(test, BIND6_PROG_PATH); +} + static int connect4_prog_load(const struct sock_addr_test *test) { return load_path(test, CONNECT4_PROG_PATH); diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 0fa1e421c3d7..427ca00a3217 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1273,6 +1273,16 @@ static char *test_to_str(int test) return "unknown"; } +static void append_str(char *dst, const char *src, size_t dst_cap) +{ + size_t avail = dst_cap - strlen(dst); + + if (avail <= 1) /* just zero byte could be written */ + return; + + strncat(dst, src, avail - 1); /* strncat() adds + 1 for zero byte */ +} + #define OPTSTRING 60 static void test_options(char *options) { @@ -1281,42 +1291,42 @@ static void test_options(char *options) memset(options, 0, OPTSTRING); if (txmsg_pass) - strncat(options, "pass,", OPTSTRING); + append_str(options, "pass,", OPTSTRING); if (txmsg_redir) - strncat(options, "redir,", OPTSTRING); + append_str(options, "redir,", OPTSTRING); if (txmsg_drop) - strncat(options, "drop,", OPTSTRING); + append_str(options, "drop,", OPTSTRING); if (txmsg_apply) { snprintf(tstr, OPTSTRING, "apply %d,", txmsg_apply); - strncat(options, tstr, OPTSTRING); + append_str(options, tstr, OPTSTRING); } if (txmsg_cork) { snprintf(tstr, OPTSTRING, "cork %d,", txmsg_cork); - strncat(options, tstr, OPTSTRING); + append_str(options, tstr, OPTSTRING); } if (txmsg_start) { snprintf(tstr, OPTSTRING, "start %d,", txmsg_start); - strncat(options, tstr, OPTSTRING); + append_str(options, tstr, OPTSTRING); } if (txmsg_end) { snprintf(tstr, OPTSTRING, "end %d,", txmsg_end); - strncat(options, tstr, OPTSTRING); + append_str(options, tstr, OPTSTRING); } if (txmsg_start_pop) { snprintf(tstr, OPTSTRING, "pop (%d,%d),", txmsg_start_pop, txmsg_start_pop + txmsg_pop); - strncat(options, tstr, OPTSTRING); + append_str(options, tstr, OPTSTRING); } if (txmsg_ingress) - strncat(options, "ingress,", OPTSTRING); + append_str(options, "ingress,", OPTSTRING); if (txmsg_redir_skb) - strncat(options, "redir_skb,", OPTSTRING); + append_str(options, "redir_skb,", OPTSTRING); if (txmsg_ktls_skb) - strncat(options, "ktls_skb,", OPTSTRING); + append_str(options, "ktls_skb,", OPTSTRING); if (ktls) - strncat(options, "ktls,", OPTSTRING); + append_str(options, "ktls,", OPTSTRING); if (peek_flag) - strncat(options, "peek,", OPTSTRING); + append_str(options, "peek,", OPTSTRING); } static int __test_exec(int cgrp, int test, struct sockmap_options *opt) diff --git a/tools/testing/selftests/bpf/test_tcpbpf.h b/tools/testing/selftests/bpf/test_tcpbpf.h index 6220b95cbd02..9dd9b5590f9d 100644 --- a/tools/testing/selftests/bpf/test_tcpbpf.h +++ b/tools/testing/selftests/bpf/test_tcpbpf.h @@ -14,5 +14,9 @@ struct tcpbpf_globals { __u64 bytes_acked; __u32 num_listen; __u32 num_close_events; + __u32 tcp_save_syn; + __u32 tcp_saved_syn; + __u32 window_clamp_client; + __u32 window_clamp_server; }; #endif diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c deleted file mode 100644 index 74a9e49988b6..000000000000 --- a/tools/testing/selftests/bpf/test_tcpbpf_user.c +++ /dev/null @@ -1,165 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <inttypes.h> -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <linux/bpf.h> -#include <sys/types.h> -#include <bpf/bpf.h> -#include <bpf/libbpf.h> - -#include "bpf_rlimit.h" -#include "bpf_util.h" -#include "cgroup_helpers.h" - -#include "test_tcpbpf.h" - -/* 3 comes from one listening socket + both ends of the connection */ -#define EXPECTED_CLOSE_EVENTS 3 - -#define EXPECT_EQ(expected, actual, fmt) \ - do { \ - if ((expected) != (actual)) { \ - printf(" Value of: " #actual "\n" \ - " Actual: %" fmt "\n" \ - " Expected: %" fmt "\n", \ - (actual), (expected)); \ - ret--; \ - } \ - } while (0) - -int verify_result(const struct tcpbpf_globals *result) -{ - __u32 expected_events; - int ret = 0; - - expected_events = ((1 << BPF_SOCK_OPS_TIMEOUT_INIT) | - (1 << BPF_SOCK_OPS_RWND_INIT) | - (1 << BPF_SOCK_OPS_TCP_CONNECT_CB) | - (1 << BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB) | - (1 << BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) | - (1 << BPF_SOCK_OPS_NEEDS_ECN) | - (1 << BPF_SOCK_OPS_STATE_CB) | - (1 << BPF_SOCK_OPS_TCP_LISTEN_CB)); - - EXPECT_EQ(expected_events, result->event_map, "#" PRIx32); - EXPECT_EQ(501ULL, result->bytes_received, "llu"); - EXPECT_EQ(1002ULL, result->bytes_acked, "llu"); - EXPECT_EQ(1, result->data_segs_in, PRIu32); - EXPECT_EQ(1, result->data_segs_out, PRIu32); - EXPECT_EQ(0x80, result->bad_cb_test_rv, PRIu32); - EXPECT_EQ(0, result->good_cb_test_rv, PRIu32); - EXPECT_EQ(1, result->num_listen, PRIu32); - EXPECT_EQ(EXPECTED_CLOSE_EVENTS, result->num_close_events, PRIu32); - - return ret; -} - -int verify_sockopt_result(int sock_map_fd) -{ - __u32 key = 0; - int ret = 0; - int res; - int rv; - - /* check setsockopt for SAVE_SYN */ - rv = bpf_map_lookup_elem(sock_map_fd, &key, &res); - EXPECT_EQ(0, rv, "d"); - EXPECT_EQ(0, res, "d"); - key = 1; - /* check getsockopt for SAVED_SYN */ - rv = bpf_map_lookup_elem(sock_map_fd, &key, &res); - EXPECT_EQ(0, rv, "d"); - EXPECT_EQ(1, res, "d"); - return ret; -} - -static int bpf_find_map(const char *test, struct bpf_object *obj, - const char *name) -{ - struct bpf_map *map; - - map = bpf_object__find_map_by_name(obj, name); - if (!map) { - printf("%s:FAIL:map '%s' not found\n", test, name); - return -1; - } - return bpf_map__fd(map); -} - -int main(int argc, char **argv) -{ - const char *file = "test_tcpbpf_kern.o"; - int prog_fd, map_fd, sock_map_fd; - struct tcpbpf_globals g = {0}; - const char *cg_path = "/foo"; - int error = EXIT_FAILURE; - struct bpf_object *obj; - int cg_fd = -1; - int retry = 10; - __u32 key = 0; - int rv; - - cg_fd = cgroup_setup_and_join(cg_path); - if (cg_fd < 0) - goto err; - - if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) { - printf("FAILED: load_bpf_file failed for: %s\n", file); - goto err; - } - - rv = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SOCK_OPS, 0); - if (rv) { - printf("FAILED: bpf_prog_attach: %d (%s)\n", - error, strerror(errno)); - goto err; - } - - if (system("./tcp_server.py")) { - printf("FAILED: TCP server\n"); - goto err; - } - - map_fd = bpf_find_map(__func__, obj, "global_map"); - if (map_fd < 0) - goto err; - - sock_map_fd = bpf_find_map(__func__, obj, "sockopt_results"); - if (sock_map_fd < 0) - goto err; - -retry_lookup: - rv = bpf_map_lookup_elem(map_fd, &key, &g); - if (rv != 0) { - printf("FAILED: bpf_map_lookup_elem returns %d\n", rv); - goto err; - } - - if (g.num_close_events != EXPECTED_CLOSE_EVENTS && retry--) { - printf("Unexpected number of close events (%d), retrying!\n", - g.num_close_events); - usleep(100); - goto retry_lookup; - } - - if (verify_result(&g)) { - printf("FAILED: Wrong stats\n"); - goto err; - } - - if (verify_sockopt_result(sock_map_fd)) { - printf("FAILED: Wrong sockopt stats\n"); - goto err; - } - - printf("PASSED!\n"); - error = 0; -err: - bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS); - close(cg_fd); - cleanup_cgroup_environment(); - return error; -} diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index bd12ec97a44d..1ccbe804e8e1 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -24,12 +24,12 @@ # Root namespace with metadata-mode tunnel + BPF # Device names and addresses: # veth1 IP: 172.16.1.200, IPv6: 00::22 (underlay) -# tunnel dev <type>11, ex: gre11, IPv4: 10.1.1.200 (overlay) +# tunnel dev <type>11, ex: gre11, IPv4: 10.1.1.200, IPv6: 1::22 (overlay) # # Namespace at_ns0 with native tunnel # Device names and addresses: # veth0 IPv4: 172.16.1.100, IPv6: 00::11 (underlay) -# tunnel dev <type>00, ex: gre00, IPv4: 10.1.1.100 (overlay) +# tunnel dev <type>00, ex: gre00, IPv4: 10.1.1.100, IPv6: 1::11 (overlay) # # # End-to-end ping packet flow @@ -250,7 +250,7 @@ add_ipip_tunnel() ip addr add dev $DEV 10.1.1.200/24 } -add_ipip6tnl_tunnel() +add_ip6tnl_tunnel() { ip netns exec at_ns0 ip addr add ::11/96 dev veth0 ip netns exec at_ns0 ip link set dev veth0 up @@ -262,11 +262,13 @@ add_ipip6tnl_tunnel() ip link add dev $DEV_NS type $TYPE \ local ::11 remote ::22 ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 + ip netns exec at_ns0 ip addr add dev $DEV_NS 1::11/96 ip netns exec at_ns0 ip link set dev $DEV_NS up # root namespace ip link add dev $DEV type $TYPE external ip addr add dev $DEV 10.1.1.200/24 + ip addr add dev $DEV 1::22/96 ip link set dev $DEV up } @@ -534,7 +536,7 @@ test_ipip6() check $TYPE config_device - add_ipip6tnl_tunnel + add_ip6tnl_tunnel ip link set dev veth1 mtu 1500 attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel # underlay @@ -553,6 +555,34 @@ test_ipip6() echo -e ${GREEN}"PASS: $TYPE"${NC} } +test_ip6ip6() +{ + TYPE=ip6tnl + DEV_NS=ip6ip6tnl00 + DEV=ip6ip6tnl11 + ret=0 + + check $TYPE + config_device + add_ip6tnl_tunnel + ip link set dev veth1 mtu 1500 + attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel + # underlay + ping6 $PING_ARG ::11 + # ip6 over ip6 + ping6 $PING_ARG 1::11 + check_err $? + ip netns exec at_ns0 ping6 $PING_ARG 1::22 + check_err $? + cleanup + + if [ $ret -ne 0 ]; then + echo -e ${RED}"FAIL: ip6$TYPE"${NC} + return 1 + fi + echo -e ${GREEN}"PASS: ip6$TYPE"${NC} +} + setup_xfrm_tunnel() { auth=0x$(printf '1%.0s' {1..40}) @@ -646,6 +676,7 @@ cleanup() ip link del veth1 2> /dev/null ip link del ipip11 2> /dev/null ip link del ipip6tnl11 2> /dev/null + ip link del ip6ip6tnl11 2> /dev/null ip link del gretap11 2> /dev/null ip link del ip6gre11 2> /dev/null ip link del ip6gretap11 2> /dev/null @@ -742,6 +773,10 @@ bpf_tunnel_test() test_ipip6 errors=$(( $errors + $? )) + echo "Testing IP6IP6 tunnel..." + test_ip6ip6 + errors=$(( $errors + $? )) + echo "Testing IPSec tunnel..." test_xfrm_tunnel errors=$(( $errors + $? )) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 9be395d9dc64..777a81404fdb 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -875,19 +875,36 @@ static int do_prog_test_run(int fd_prog, bool unpriv, uint32_t expected_val, __u8 tmp[TEST_DATA_LEN << 2]; __u32 size_tmp = sizeof(tmp); uint32_t retval; - int err; + int err, saved_errno; if (unpriv) set_admin(true); err = bpf_prog_test_run(fd_prog, 1, data, size_data, tmp, &size_tmp, &retval, NULL); + saved_errno = errno; + if (unpriv) set_admin(false); - if (err && errno != 524/*ENOTSUPP*/ && errno != EPERM) { - printf("Unexpected bpf_prog_test_run error "); - return err; + + if (err) { + switch (saved_errno) { + case 524/*ENOTSUPP*/: + printf("Did not run the program (not supported) "); + return 0; + case EPERM: + if (unpriv) { + printf("Did not run the program (no permission) "); + return 0; + } + /* fallthrough; */ + default: + printf("FAIL: Unexpected bpf_prog_test_run error (%s) ", + strerror(saved_errno)); + return err; + } } - if (!err && retval != expected_val && + + if (retval != expected_val && expected_val != POINTER_VALUE) { printf("FAIL retval %d != %d ", retval, expected_val); return 1; @@ -936,6 +953,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, int run_errs, run_successes; int map_fds[MAX_NR_MAPS]; const char *expected_err; + int saved_errno; int fixup_skips; __u32 pflags; int i, err; @@ -997,6 +1015,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, } fd_prog = bpf_load_program_xattr(&attr, bpf_vlog, sizeof(bpf_vlog)); + saved_errno = errno; /* BPF_PROG_TYPE_TRACING requires more setup and * bpf_probe_prog_type won't give correct answer @@ -1013,7 +1032,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, if (expected_ret == ACCEPT || expected_ret == VERBOSE_ACCEPT) { if (fd_prog < 0) { printf("FAIL\nFailed to load prog '%s'!\n", - strerror(errno)); + strerror(saved_errno)); goto fail_log; } #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS @@ -1152,6 +1171,19 @@ static void get_unpriv_disabled() static bool test_as_unpriv(struct bpf_test *test) { +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + /* Some architectures have strict alignment requirements. In + * that case, the BPF verifier detects if a program has + * unaligned accesses and rejects them. A user can pass + * BPF_F_ANY_ALIGNMENT to a program to override this + * check. That, however, will only work when a privileged user + * loads a program. An unprivileged user loading a program + * with this flag will be rejected prior entering the + * verifier. + */ + if (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS) + return false; +#endif return !test->prog_type || test->prog_type == BPF_PROG_TYPE_SOCKET_FILTER || test->prog_type == BPF_PROG_TYPE_CGROUP_SKB; diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh new file mode 100755 index 000000000000..88a7483eaae4 --- /dev/null +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -0,0 +1,259 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright(c) 2020 Intel Corporation, Weqaar Janjua <weqaar.a.janjua@intel.com> + +# AF_XDP selftests based on veth +# +# End-to-end AF_XDP over Veth test +# +# Topology: +# --------- +# ----------- +# _ | Process | _ +# / ----------- \ +# / | \ +# / | \ +# ----------- | ----------- +# | Thread1 | | | Thread2 | +# ----------- | ----------- +# | | | +# ----------- | ----------- +# | xskX | | | xskY | +# ----------- | ----------- +# | | | +# ----------- | ---------- +# | vethX | --------- | vethY | +# ----------- peer ---------- +# | | | +# namespaceX | namespaceY +# +# AF_XDP is an address family optimized for high performance packet processing, +# it is XDP’s user-space interface. +# +# An AF_XDP socket is linked to a single UMEM which is a region of virtual +# contiguous memory, divided into equal-sized frames. +# +# Refer to AF_XDP Kernel Documentation for detailed information: +# https://www.kernel.org/doc/html/latest/networking/af_xdp.html +# +# Prerequisites setup by script: +# +# Set up veth interfaces as per the topology shown ^^: +# * setup two veth interfaces and one namespace +# ** veth<xxxx> in root namespace +# ** veth<yyyy> in af_xdp<xxxx> namespace +# ** namespace af_xdp<xxxx> +# * create a spec file veth.spec that includes this run-time configuration +# *** xxxx and yyyy are randomly generated 4 digit numbers used to avoid +# conflict with any existing interface +# * tests the veth and xsk layers of the topology +# +# See the source xdpxceiver.c for information on each test +# +# Kernel configuration: +# --------------------- +# See "config" file for recommended kernel config options. +# +# Turn on XDP sockets and veth support when compiling i.e. +# Networking support --> +# Networking options --> +# [ * ] XDP sockets +# +# Executing Tests: +# ---------------- +# Must run with CAP_NET_ADMIN capability. +# +# Run (full color-coded output): +# sudo ./test_xsk.sh -c +# +# If running from kselftests: +# sudo make colorconsole=1 run_tests +# +# Run (full output without color-coding): +# sudo ./test_xsk.sh + +. xsk_prereqs.sh + +while getopts c flag +do + case "${flag}" in + c) colorconsole=1;; + esac +done + +TEST_NAME="PREREQUISITES" + +URANDOM=/dev/urandom +[ ! -e "${URANDOM}" ] && { echo "${URANDOM} not found. Skipping tests."; test_exit 1 1; } + +VETH0_POSTFIX=$(cat ${URANDOM} | tr -dc '0-9' | fold -w 256 | head -n 1 | head --bytes 4) +VETH0=ve${VETH0_POSTFIX} +VETH1_POSTFIX=$(cat ${URANDOM} | tr -dc '0-9' | fold -w 256 | head -n 1 | head --bytes 4) +VETH1=ve${VETH1_POSTFIX} +NS0=root +NS1=af_xdp${VETH1_POSTFIX} +MTU=1500 + +setup_vethPairs() { + echo "setting up ${VETH0}: namespace: ${NS0}" + ip netns add ${NS1} + ip link add ${VETH0} type veth peer name ${VETH1} + if [ -f /proc/net/if_inet6 ]; then + echo 1 > /proc/sys/net/ipv6/conf/${VETH0}/disable_ipv6 + fi + echo "setting up ${VETH1}: namespace: ${NS1}" + ip link set ${VETH1} netns ${NS1} + ip netns exec ${NS1} ip link set ${VETH1} mtu ${MTU} + ip link set ${VETH0} mtu ${MTU} + ip netns exec ${NS1} ip link set ${VETH1} up + ip link set ${VETH0} up +} + +validate_root_exec +validate_veth_support ${VETH0} +validate_ip_utility +setup_vethPairs + +retval=$? +if [ $retval -ne 0 ]; then + test_status $retval "${TEST_NAME}" + cleanup_exit ${VETH0} ${VETH1} ${NS1} + exit $retval +fi + +echo "${VETH0}:${VETH1},${NS1}" > ${SPECFILE} + +validate_veth_spec_file + +echo "Spec file created: ${SPECFILE}" + +test_status $retval "${TEST_NAME}" + +## START TESTS + +statusList=() + +### TEST 1 +TEST_NAME="XSK KSELFTEST FRAMEWORK" + +echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Generic mode" +vethXDPgeneric ${VETH0} ${VETH1} ${NS1} + +retval=$? +if [ $retval -eq 0 ]; then + echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Native mode" + vethXDPnative ${VETH0} ${VETH1} ${NS1} +fi + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 2 +TEST_NAME="SKB NOPOLL" + +vethXDPgeneric ${VETH0} ${VETH1} ${NS1} + +params=("-S") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 3 +TEST_NAME="SKB POLL" + +vethXDPgeneric ${VETH0} ${VETH1} ${NS1} + +params=("-S" "-p") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 4 +TEST_NAME="DRV NOPOLL" + +vethXDPnative ${VETH0} ${VETH1} ${NS1} + +params=("-N") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 5 +TEST_NAME="DRV POLL" + +vethXDPnative ${VETH0} ${VETH1} ${NS1} + +params=("-N" "-p") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 6 +TEST_NAME="SKB SOCKET TEARDOWN" + +vethXDPgeneric ${VETH0} ${VETH1} ${NS1} + +params=("-S" "-T") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 7 +TEST_NAME="DRV SOCKET TEARDOWN" + +vethXDPnative ${VETH0} ${VETH1} ${NS1} + +params=("-N" "-T") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 8 +TEST_NAME="SKB BIDIRECTIONAL SOCKETS" + +vethXDPgeneric ${VETH0} ${VETH1} ${NS1} + +params=("-S" "-B") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 9 +TEST_NAME="DRV BIDIRECTIONAL SOCKETS" + +vethXDPnative ${VETH0} ${VETH1} ${NS1} + +params=("-N" "-B") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +## END TESTS + +cleanup_exit ${VETH0} ${VETH1} ${NS1} + +for _status in "${statusList[@]}" +do + if [ $_status -ne 0 ]; then + test_exit $ksft_fail 0 + fi +done + +test_exit $ksft_pass 0 diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index 1c4b1939f5a8..bed53b561e04 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -68,7 +68,7 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), - BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 0xffffffff, 1), + BPF_JMP32_IMM(BPF_JSGT, BPF_REG_1, 0xffffffff, 1), BPF_MOV32_IMM(BPF_REG_1, 0), BPF_MOV32_IMM(BPF_REG_2, MAX_ENTRIES), BPF_JMP_REG(BPF_JSGT, BPF_REG_2, BPF_REG_1, 1), diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index dac40de3f868..57ed67b86074 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -703,3 +703,44 @@ .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, +{ + "bounds checks after 32-bit truncation. test 1", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + /* This used to reduce the max bound to 0x7fffffff */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 0x7fffffff, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 3 }, + .errstr_unpriv = "R0 leaks addr", + .result_unpriv = REJECT, + .result = ACCEPT, +}, +{ + "bounds checks after 32-bit truncation. test 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSLT, BPF_REG_1, 1, 1), + BPF_JMP32_IMM(BPF_JSLT, BPF_REG_1, 0, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 3 }, + .errstr_unpriv = "R0 leaks addr", + .result_unpriv = REJECT, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c index 2ad5f974451c..fb13ca2d5606 100644 --- a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c +++ b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c @@ -266,6 +266,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "invalid 8-byte read from bpf_sk_lookup remote_ip4 field", @@ -292,6 +293,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "invalid 8-byte read from bpf_sk_lookup remote_port field", @@ -305,6 +307,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "invalid 8-byte read from bpf_sk_lookup local_ip4 field", @@ -331,6 +334,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "invalid 8-byte read from bpf_sk_lookup local_port field", @@ -344,6 +348,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, /* invalid 1,2,4-byte reads from 8-byte fields in bpf_sk_lookup */ { @@ -410,6 +415,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "invalid 4-byte unaligned read from bpf_sk_lookup at even offset", @@ -422,6 +428,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, /* in-bound and out-of-bound writes to bpf_sk_lookup */ { diff --git a/tools/testing/selftests/bpf/verifier/ctx_skb.c b/tools/testing/selftests/bpf/verifier/ctx_skb.c index 2e16b8e268f2..2022c0f2cd75 100644 --- a/tools/testing/selftests/bpf/verifier/ctx_skb.c +++ b/tools/testing/selftests/bpf/verifier/ctx_skb.c @@ -1089,3 +1089,45 @@ .errstr_unpriv = "R1 leaks addr", .result = REJECT, }, +{ + "pkt > pkt_end taken check", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, // 0. r2 = *(u32 *)(r1 + data_end) + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, // 1. r4 = *(u32 *)(r1 + data) + offsetof(struct __sk_buff, data)), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_4), // 2. r3 = r4 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 42), // 3. r3 += 42 + BPF_MOV64_IMM(BPF_REG_1, 0), // 4. r1 = 0 + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_2, 2), // 5. if r3 > r2 goto 8 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 14), // 6. r4 += 14 + BPF_MOV64_REG(BPF_REG_1, BPF_REG_4), // 7. r1 = r4 + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_2, 1), // 8. if r3 > r2 goto 10 + BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1, 9), // 9. r2 = *(u8 *)(r1 + 9) + BPF_MOV64_IMM(BPF_REG_0, 0), // 10. r0 = 0 + BPF_EXIT_INSN(), // 11. exit + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_SKB, +}, +{ + "pkt_end < pkt taken check", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, // 0. r2 = *(u32 *)(r1 + data_end) + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, // 1. r4 = *(u32 *)(r1 + data) + offsetof(struct __sk_buff, data)), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_4), // 2. r3 = r4 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 42), // 3. r3 += 42 + BPF_MOV64_IMM(BPF_REG_1, 0), // 4. r1 = 0 + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_2, 2), // 5. if r3 > r2 goto 8 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 14), // 6. r4 += 14 + BPF_MOV64_REG(BPF_REG_1, BPF_REG_4), // 7. r1 = r4 + BPF_JMP_REG(BPF_JLT, BPF_REG_2, BPF_REG_3, 1), // 8. if r2 < r3 goto 10 + BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1, 9), // 9. r2 = *(u8 *)(r1 + 9) + BPF_MOV64_IMM(BPF_REG_0, 0), // 10. r0 = 0 + BPF_EXIT_INSN(), // 11. exit + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_SKB, +}, diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index 988f46a1a4c7..c0648dc009b5 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -69,6 +69,7 @@ .fixup_map_array_48b = { 1 }, .result = REJECT, .errstr = "R1 min value is outside of the allowed memory range", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "direct map access, write test 7", @@ -195,6 +196,7 @@ .fixup_map_array_48b = { 1, 3 }, .result = REJECT, .errstr = "invalid access to map value, value_size=48 off=47 size=2", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "direct map access, write test 17", @@ -209,6 +211,7 @@ .fixup_map_array_48b = { 1, 3 }, .result = REJECT, .errstr = "invalid access to map value, value_size=48 off=47 size=2", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "direct map access, write test 18", diff --git a/tools/testing/selftests/bpf/verifier/map_ptr.c b/tools/testing/selftests/bpf/verifier/map_ptr.c index 637f9293bda8..b117bdd3806d 100644 --- a/tools/testing/selftests/bpf/verifier/map_ptr.c +++ b/tools/testing/selftests/bpf/verifier/map_ptr.c @@ -44,6 +44,7 @@ .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", .result = REJECT, .errstr = "cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "bpf_map_ptr: read ops field accepted", diff --git a/tools/testing/selftests/bpf/verifier/raw_tp_writable.c b/tools/testing/selftests/bpf/verifier/raw_tp_writable.c index 95b5d70a1dc1..2978fb5a769d 100644 --- a/tools/testing/selftests/bpf/verifier/raw_tp_writable.c +++ b/tools/testing/selftests/bpf/verifier/raw_tp_writable.c @@ -31,4 +31,5 @@ .fixup_map_hash_8b = { 1, }, .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, .errstr = "R6 invalid variable buffer offset: off=0, var_off=(0x0; 0xffffffff)", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index 006b5bd99c08..3b6ee009c00b 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -675,6 +675,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .errstr = "invalid mem access", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "reference tracking: use ptr from bpf_sk_fullsock() after release", @@ -698,6 +699,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .errstr = "invalid mem access", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "reference tracking: use ptr from bpf_sk_fullsock(tp) after release", @@ -725,6 +727,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .errstr = "invalid mem access", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "reference tracking: use sk after bpf_sk_release(tp)", @@ -747,6 +750,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .errstr = "invalid mem access", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "reference tracking: use ptr from bpf_get_listener_sock() after bpf_sk_release(sk)", diff --git a/tools/testing/selftests/bpf/verifier/regalloc.c b/tools/testing/selftests/bpf/verifier/regalloc.c index 4ad7e05de706..bb0dd89dd212 100644 --- a/tools/testing/selftests/bpf/verifier/regalloc.c +++ b/tools/testing/selftests/bpf/verifier/regalloc.c @@ -21,6 +21,7 @@ .fixup_map_hash_48b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc negative", @@ -71,6 +72,7 @@ .fixup_map_hash_48b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc src_reg negative", @@ -97,6 +99,7 @@ .result = REJECT, .errstr = "invalid access to map value, value_size=48 off=44 size=8", .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc and spill", @@ -126,6 +129,7 @@ .fixup_map_hash_48b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc and spill negative", @@ -156,6 +160,7 @@ .result = REJECT, .errstr = "invalid access to map value, value_size=48 off=48 size=8", .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc three regs", @@ -182,6 +187,7 @@ .fixup_map_hash_48b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc after call", @@ -210,6 +216,7 @@ .fixup_map_hash_48b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc in callee", @@ -240,6 +247,7 @@ .fixup_map_hash_48b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc, spill, JEQ", diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index 91bb77c24a2e..a3fe0fbaed41 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -108,8 +108,9 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "invalid indirect read from stack off -8+0 size 8", - .result = REJECT, + .errstr_unpriv = "invalid indirect read from stack off -8+0 size 8", + .result_unpriv = REJECT, + .result = ACCEPT, }, { "unpriv: mangle pointer on stack 1", diff --git a/tools/testing/selftests/bpf/verifier/wide_access.c b/tools/testing/selftests/bpf/verifier/wide_access.c index ccade9312d21..55af248efa93 100644 --- a/tools/testing/selftests/bpf/verifier/wide_access.c +++ b/tools/testing/selftests/bpf/verifier/wide_access.c @@ -1,4 +1,4 @@ -#define BPF_SOCK_ADDR_STORE(field, off, res, err) \ +#define BPF_SOCK_ADDR_STORE(field, off, res, err, flgs) \ { \ "wide store to bpf_sock_addr." #field "[" #off "]", \ .insns = { \ @@ -11,31 +11,36 @@ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ .errstr = err, \ + .flags = flgs, \ } /* user_ip6[0] is u64 aligned */ BPF_SOCK_ADDR_STORE(user_ip6, 0, ACCEPT, - NULL), + NULL, 0), BPF_SOCK_ADDR_STORE(user_ip6, 1, REJECT, - "invalid bpf_context access off=12 size=8"), + "invalid bpf_context access off=12 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), BPF_SOCK_ADDR_STORE(user_ip6, 2, ACCEPT, - NULL), + NULL, 0), BPF_SOCK_ADDR_STORE(user_ip6, 3, REJECT, - "invalid bpf_context access off=20 size=8"), + "invalid bpf_context access off=20 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), /* msg_src_ip6[0] is _not_ u64 aligned */ BPF_SOCK_ADDR_STORE(msg_src_ip6, 0, REJECT, - "invalid bpf_context access off=44 size=8"), + "invalid bpf_context access off=44 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), BPF_SOCK_ADDR_STORE(msg_src_ip6, 1, ACCEPT, - NULL), + NULL, 0), BPF_SOCK_ADDR_STORE(msg_src_ip6, 2, REJECT, - "invalid bpf_context access off=52 size=8"), + "invalid bpf_context access off=52 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), BPF_SOCK_ADDR_STORE(msg_src_ip6, 3, REJECT, - "invalid bpf_context access off=56 size=8"), + "invalid bpf_context access off=56 size=8", 0), #undef BPF_SOCK_ADDR_STORE -#define BPF_SOCK_ADDR_LOAD(field, off, res, err) \ +#define BPF_SOCK_ADDR_LOAD(field, off, res, err, flgs) \ { \ "wide load from bpf_sock_addr." #field "[" #off "]", \ .insns = { \ @@ -48,26 +53,31 @@ BPF_SOCK_ADDR_STORE(msg_src_ip6, 3, REJECT, .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ .errstr = err, \ + .flags = flgs, \ } /* user_ip6[0] is u64 aligned */ BPF_SOCK_ADDR_LOAD(user_ip6, 0, ACCEPT, - NULL), + NULL, 0), BPF_SOCK_ADDR_LOAD(user_ip6, 1, REJECT, - "invalid bpf_context access off=12 size=8"), + "invalid bpf_context access off=12 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), BPF_SOCK_ADDR_LOAD(user_ip6, 2, ACCEPT, - NULL), + NULL, 0), BPF_SOCK_ADDR_LOAD(user_ip6, 3, REJECT, - "invalid bpf_context access off=20 size=8"), + "invalid bpf_context access off=20 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), /* msg_src_ip6[0] is _not_ u64 aligned */ BPF_SOCK_ADDR_LOAD(msg_src_ip6, 0, REJECT, - "invalid bpf_context access off=44 size=8"), + "invalid bpf_context access off=44 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), BPF_SOCK_ADDR_LOAD(msg_src_ip6, 1, ACCEPT, - NULL), + NULL, 0), BPF_SOCK_ADDR_LOAD(msg_src_ip6, 2, REJECT, - "invalid bpf_context access off=52 size=8"), + "invalid bpf_context access off=52 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), BPF_SOCK_ADDR_LOAD(msg_src_ip6, 3, REJECT, - "invalid bpf_context access off=56 size=8"), + "invalid bpf_context access off=56 size=8", 0), #undef BPF_SOCK_ADDR_LOAD diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c new file mode 100644 index 000000000000..014dedaa4dd2 --- /dev/null +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -0,0 +1,1074 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2020 Intel Corporation. */ + +/* + * Some functions in this program are taken from + * Linux kernel samples/bpf/xdpsock* and modified + * for use. + * + * See test_xsk.sh for detailed information on test topology + * and prerequisite network setup. + * + * This test program contains two threads, each thread is single socket with + * a unique UMEM. It validates in-order packet delivery and packet content + * by sending packets to each other. + * + * Tests Information: + * ------------------ + * These selftests test AF_XDP SKB and Native/DRV modes using veth + * Virtual Ethernet interfaces. + * + * The following tests are run: + * + * 1. AF_XDP SKB mode + * Generic mode XDP is driver independent, used when the driver does + * not have support for XDP. Works on any netdevice using sockets and + * generic XDP path. XDP hook from netif_receive_skb(). + * a. nopoll - soft-irq processing + * b. poll - using poll() syscall + * c. Socket Teardown + * Create a Tx and a Rx socket, Tx from one socket, Rx on another. Destroy + * both sockets, then repeat multiple times. Only nopoll mode is used + * d. Bi-directional sockets + * Configure sockets as bi-directional tx/rx sockets, sets up fill and + * completion rings on each socket, tx/rx in both directions. Only nopoll + * mode is used + * + * 2. AF_XDP DRV/Native mode + * Works on any netdevice with XDP_REDIRECT support, driver dependent. Processes + * packets before SKB allocation. Provides better performance than SKB. Driver + * hook available just after DMA of buffer descriptor. + * a. nopoll + * b. poll + * c. Socket Teardown + * d. Bi-directional sockets + * - Only copy mode is supported because veth does not currently support + * zero-copy mode + * + * Total tests: 8 + * + * Flow: + * ----- + * - Single process spawns two threads: Tx and Rx + * - Each of these two threads attach to a veth interface within their assigned + * namespaces + * - Each thread Creates one AF_XDP socket connected to a unique umem for each + * veth interface + * - Tx thread Transmits 10k packets from veth<xxxx> to veth<yyyy> + * - Rx thread verifies if all 10k packets were received and delivered in-order, + * and have the right content + * + * Enable/disable debug mode: + * -------------------------- + * To enable L2 - L4 headers and payload dump of each packet on STDOUT, add + * parameter -D to params array in test_xsk.sh, i.e. params=("-S" "-D") + */ + +#define _GNU_SOURCE +#include <fcntl.h> +#include <errno.h> +#include <getopt.h> +#include <asm/barrier.h> +typedef __u16 __sum16; +#include <linux/if_link.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <arpa/inet.h> +#include <net/if.h> +#include <locale.h> +#include <poll.h> +#include <pthread.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stddef.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <time.h> +#include <unistd.h> +#include <stdatomic.h> +#include <bpf/xsk.h> +#include "xdpxceiver.h" +#include "../kselftest.h" + +static void __exit_with_error(int error, const char *file, const char *func, int line) +{ + ksft_test_result_fail + ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); + ksft_exit_xfail(); +} + +#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) + +#define print_ksft_result(void)\ + (ksft_test_result_pass("PASS: %s %s %s%s\n", uut ? "DRV" : "SKB", opt_poll ? "POLL" :\ + "NOPOLL", opt_teardown ? "Socket Teardown" : "",\ + opt_bidi ? "Bi-directional Sockets" : "")) + +static void pthread_init_mutex(void) +{ + pthread_mutex_init(&sync_mutex, NULL); + pthread_mutex_init(&sync_mutex_tx, NULL); + pthread_cond_init(&signal_rx_condition, NULL); + pthread_cond_init(&signal_tx_condition, NULL); +} + +static void pthread_destroy_mutex(void) +{ + pthread_mutex_destroy(&sync_mutex); + pthread_mutex_destroy(&sync_mutex_tx); + pthread_cond_destroy(&signal_rx_condition); + pthread_cond_destroy(&signal_tx_condition); +} + +static void *memset32_htonl(void *dest, u32 val, u32 size) +{ + u32 *ptr = (u32 *)dest; + int i; + + val = htonl(val); + + for (i = 0; i < (size & (~0x3)); i += 4) + ptr[i >> 2] = val; + + for (; i < size; i++) + ((char *)dest)[i] = ((char *)&val)[i & 3]; + + return dest; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static inline unsigned short from32to16(unsigned int x) +{ + /* add up 16-bit and 16-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. */ + x = (x & 0xffff) + (x >> 16); + return x; +} + +/* + * Fold a partial checksum + * This function code has been taken from + * Linux kernel include/asm-generic/checksum.h + */ +static inline __u16 csum_fold(__u32 csum) +{ + u32 sum = (__force u32)csum; + + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + return (__force __u16)~sum; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static inline u32 from64to32(u64 x) +{ + /* add up 32-bit and 32-bit for 32+c bit */ + x = (x & 0xffffffff) + (x >> 32); + /* add up carry.. */ + x = (x & 0xffffffff) + (x >> 32); + return (u32)x; +} + +__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum); + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) +{ + unsigned long long s = (__force u32)sum; + + s += (__force u32)saddr; + s += (__force u32)daddr; +#ifdef __BIG_ENDIAN__ + s += proto + len; +#else + s += (proto + len) << 8; +#endif + return (__force __u32)from64to32(s); +} + +/* + * This function has been taken from + * Linux kernel include/asm-generic/checksum.h + */ +static inline __u16 +csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); +} + +static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt) +{ + u32 csum = 0; + u32 cnt = 0; + + /* udp hdr and data */ + for (; cnt < len; cnt += 2) + csum += udp_pkt[cnt >> 1]; + + return csum_tcpudp_magic(saddr, daddr, len, proto, csum); +} + +static void gen_eth_hdr(void *data, struct ethhdr *eth_hdr) +{ + memcpy(eth_hdr->h_dest, ((struct ifobject *)data)->dst_mac, ETH_ALEN); + memcpy(eth_hdr->h_source, ((struct ifobject *)data)->src_mac, ETH_ALEN); + eth_hdr->h_proto = htons(ETH_P_IP); +} + +static void gen_ip_hdr(void *data, struct iphdr *ip_hdr) +{ + ip_hdr->version = IP_PKT_VER; + ip_hdr->ihl = 0x5; + ip_hdr->tos = IP_PKT_TOS; + ip_hdr->tot_len = htons(IP_PKT_SIZE); + ip_hdr->id = 0; + ip_hdr->frag_off = 0; + ip_hdr->ttl = IPDEFTTL; + ip_hdr->protocol = IPPROTO_UDP; + ip_hdr->saddr = ((struct ifobject *)data)->src_ip; + ip_hdr->daddr = ((struct ifobject *)data)->dst_ip; + ip_hdr->check = 0; +} + +static void gen_udp_hdr(void *data, void *arg, struct udphdr *udp_hdr) +{ + udp_hdr->source = htons(((struct ifobject *)arg)->src_port); + udp_hdr->dest = htons(((struct ifobject *)arg)->dst_port); + udp_hdr->len = htons(UDP_PKT_SIZE); + memset32_htonl(pkt_data + PKT_HDR_SIZE, + htonl(((struct generic_data *)data)->seqnum), UDP_PKT_DATA_SIZE); +} + +static void gen_udp_csum(struct udphdr *udp_hdr, struct iphdr *ip_hdr) +{ + udp_hdr->check = 0; + udp_hdr->check = + udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, IPPROTO_UDP, (u16 *)udp_hdr); +} + +static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) +{ + memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, PKT_SIZE); +} + +static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size) +{ + int ret; + + data->umem = calloc(1, sizeof(struct xsk_umem_info)); + if (!data->umem) + exit_with_error(errno); + + ret = xsk_umem__create(&data->umem->umem, buffer, size, + &data->umem->fq, &data->umem->cq, NULL); + if (ret) + exit_with_error(ret); + + data->umem->buffer = buffer; +} + +static void xsk_populate_fill_ring(struct xsk_umem_info *umem) +{ + int ret, i; + u32 idx; + + ret = xsk_ring_prod__reserve(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx); + if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) + exit_with_error(ret); + for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++) + *xsk_ring_prod__fill_addr(&umem->fq, idx++) = i * XSK_UMEM__DEFAULT_FRAME_SIZE; + xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS); +} + +static int xsk_configure_socket(struct ifobject *ifobject) +{ + struct xsk_socket_config cfg; + struct xsk_ring_cons *rxr; + struct xsk_ring_prod *txr; + int ret; + + ifobject->xsk = calloc(1, sizeof(struct xsk_socket_info)); + if (!ifobject->xsk) + exit_with_error(errno); + + ifobject->xsk->umem = ifobject->umem; + cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg.libbpf_flags = 0; + cfg.xdp_flags = opt_xdp_flags; + cfg.bind_flags = opt_xdp_bind_flags; + + if (!opt_bidi) { + rxr = (ifobject->fv.vector == rx) ? &ifobject->xsk->rx : NULL; + txr = (ifobject->fv.vector == tx) ? &ifobject->xsk->tx : NULL; + } else { + rxr = &ifobject->xsk->rx; + txr = &ifobject->xsk->tx; + } + + ret = xsk_socket__create(&ifobject->xsk->xsk, ifobject->ifname, + opt_queue, ifobject->umem->umem, rxr, txr, &cfg); + + if (ret) + return 1; + + return 0; +} + +static struct option long_options[] = { + {"interface", required_argument, 0, 'i'}, + {"queue", optional_argument, 0, 'q'}, + {"poll", no_argument, 0, 'p'}, + {"xdp-skb", no_argument, 0, 'S'}, + {"xdp-native", no_argument, 0, 'N'}, + {"copy", no_argument, 0, 'c'}, + {"tear-down", no_argument, 0, 'T'}, + {"bidi", optional_argument, 0, 'B'}, + {"debug", optional_argument, 0, 'D'}, + {"tx-pkt-count", optional_argument, 0, 'C'}, + {0, 0, 0, 0} +}; + +static void usage(const char *prog) +{ + const char *str = + " Usage: %s [OPTIONS]\n" + " Options:\n" + " -i, --interface Use interface\n" + " -q, --queue=n Use queue n (default 0)\n" + " -p, --poll Use poll syscall\n" + " -S, --xdp-skb=n Use XDP SKB mode\n" + " -N, --xdp-native=n Enforce XDP DRV (native) mode\n" + " -c, --copy Force copy mode\n" + " -T, --tear-down Tear down sockets by repeatedly recreating them\n" + " -B, --bidi Bi-directional sockets test\n" + " -D, --debug Debug mode - dump packets L2 - L5\n" + " -C, --tx-pkt-count=n Number of packets to send\n"; + ksft_print_msg(str, prog); +} + +static bool switch_namespace(int idx) +{ + char fqns[26] = "/var/run/netns/"; + int nsfd; + + strncat(fqns, ifdict[idx]->nsname, sizeof(fqns) - strlen(fqns) - 1); + nsfd = open(fqns, O_RDONLY); + + if (nsfd == -1) + exit_with_error(errno); + + if (setns(nsfd, 0) == -1) + exit_with_error(errno); + + return true; +} + +static void *nsswitchthread(void *args) +{ + if (switch_namespace(((struct targs *)args)->idx)) { + ifdict[((struct targs *)args)->idx]->ifindex = + if_nametoindex(ifdict[((struct targs *)args)->idx]->ifname); + if (!ifdict[((struct targs *)args)->idx]->ifindex) { + ksft_test_result_fail + ("ERROR: [%s] interface \"%s\" does not exist\n", + __func__, ifdict[((struct targs *)args)->idx]->ifname); + ((struct targs *)args)->retptr = false; + } else { + ksft_print_msg("Interface found: %s\n", + ifdict[((struct targs *)args)->idx]->ifname); + ((struct targs *)args)->retptr = true; + } + } else { + ((struct targs *)args)->retptr = false; + } + pthread_exit(NULL); +} + +static int validate_interfaces(void) +{ + bool ret = true; + + for (int i = 0; i < MAX_INTERFACES; i++) { + if (!strcmp(ifdict[i]->ifname, "")) { + ret = false; + ksft_test_result_fail("ERROR: interfaces: -i <int>,<ns> -i <int>,<ns>."); + } + if (strcmp(ifdict[i]->nsname, "")) { + struct targs *targs; + + targs = (struct targs *)malloc(sizeof(struct targs)); + if (!targs) + exit_with_error(errno); + + targs->idx = i; + if (pthread_create(&ns_thread, NULL, nsswitchthread, (void *)targs)) + exit_with_error(errno); + + pthread_join(ns_thread, NULL); + + if (targs->retptr) + ksft_print_msg("NS switched: %s\n", ifdict[i]->nsname); + + free(targs); + } else { + ifdict[i]->ifindex = if_nametoindex(ifdict[i]->ifname); + if (!ifdict[i]->ifindex) { + ksft_test_result_fail + ("ERROR: interface \"%s\" does not exist\n", ifdict[i]->ifname); + ret = false; + } else { + ksft_print_msg("Interface found: %s\n", ifdict[i]->ifname); + } + } + } + return ret; +} + +static void parse_command_line(int argc, char **argv) +{ + int option_index, interface_index = 0, c; + + opterr = 0; + + for (;;) { + c = getopt_long(argc, argv, "i:q:pSNcTBDC:", long_options, &option_index); + + if (c == -1) + break; + + switch (c) { + case 'i': + if (interface_index == MAX_INTERFACES) + break; + char *sptr, *token; + + sptr = strndupa(optarg, strlen(optarg)); + memcpy(ifdict[interface_index]->ifname, + strsep(&sptr, ","), MAX_INTERFACE_NAME_CHARS); + token = strsep(&sptr, ","); + if (token) + memcpy(ifdict[interface_index]->nsname, token, + MAX_INTERFACES_NAMESPACE_CHARS); + interface_index++; + break; + case 'q': + opt_queue = atoi(optarg); + break; + case 'p': + opt_poll = 1; + break; + case 'S': + opt_xdp_flags |= XDP_FLAGS_SKB_MODE; + opt_xdp_bind_flags |= XDP_COPY; + uut = ORDER_CONTENT_VALIDATE_XDP_SKB; + break; + case 'N': + opt_xdp_flags |= XDP_FLAGS_DRV_MODE; + opt_xdp_bind_flags |= XDP_COPY; + uut = ORDER_CONTENT_VALIDATE_XDP_DRV; + break; + case 'c': + opt_xdp_bind_flags |= XDP_COPY; + break; + case 'T': + opt_teardown = 1; + break; + case 'B': + opt_bidi = 1; + break; + case 'D': + debug_pkt_dump = 1; + break; + case 'C': + opt_pkt_count = atoi(optarg); + break; + default: + usage(basename(argv[0])); + ksft_exit_xfail(); + } + } + + if (!validate_interfaces()) { + usage(basename(argv[0])); + ksft_exit_xfail(); + } +} + +static void kick_tx(struct xsk_socket_info *xsk) +{ + int ret; + + ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || errno == EBUSY || errno == ENETDOWN) + return; + exit_with_error(errno); +} + +static inline void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) +{ + unsigned int rcvd; + u32 idx; + + if (!xsk->outstanding_tx) + return; + + if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx)) + kick_tx(xsk); + + rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); + if (rcvd) { + xsk_ring_cons__release(&xsk->umem->cq, rcvd); + xsk->outstanding_tx -= rcvd; + xsk->tx_npkts += rcvd; + } +} + +static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) +{ + unsigned int rcvd, i; + u32 idx_rx = 0, idx_fq = 0; + int ret; + + rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + if (!rcvd) { + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret < 0) + exit_with_error(ret); + } + return; + } + + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(ret); + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret < 0) + exit_with_error(ret); + } + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + } + + for (i = 0; i < rcvd; i++) { + u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; + (void)xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; + u64 orig = xsk_umem__extract_addr(addr); + + addr = xsk_umem__add_offset_to_addr(addr); + pkt_node_rx = malloc(sizeof(struct pkt) + PKT_SIZE); + if (!pkt_node_rx) + exit_with_error(errno); + + pkt_node_rx->pkt_frame = (char *)malloc(PKT_SIZE); + if (!pkt_node_rx->pkt_frame) + exit_with_error(errno); + + memcpy(pkt_node_rx->pkt_frame, xsk_umem__get_data(xsk->umem->buffer, addr), + PKT_SIZE); + + TAILQ_INSERT_HEAD(&head, pkt_node_rx, pkt_nodes); + + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; + } + + xsk_ring_prod__submit(&xsk->umem->fq, rcvd); + xsk_ring_cons__release(&xsk->rx, rcvd); + xsk->rx_npkts += rcvd; +} + +static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) +{ + u32 idx; + unsigned int i; + + while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < batch_size) + complete_tx_only(xsk, batch_size); + + for (i = 0; i < batch_size; i++) { + struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); + + tx_desc->addr = (*frameptr + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; + tx_desc->len = PKT_SIZE; + } + + xsk_ring_prod__submit(&xsk->tx, batch_size); + xsk->outstanding_tx += batch_size; + *frameptr += batch_size; + *frameptr %= num_frames; + complete_tx_only(xsk, batch_size); +} + +static inline int get_batch_size(int pkt_cnt) +{ + if (!opt_pkt_count) + return BATCH_SIZE; + + if (pkt_cnt + BATCH_SIZE <= opt_pkt_count) + return BATCH_SIZE; + + return opt_pkt_count - pkt_cnt; +} + +static void complete_tx_only_all(void *arg) +{ + bool pending; + + do { + pending = false; + if (((struct ifobject *)arg)->xsk->outstanding_tx) { + complete_tx_only(((struct ifobject *) + arg)->xsk, BATCH_SIZE); + pending = !!((struct ifobject *)arg)->xsk->outstanding_tx; + } + } while (pending); +} + +static void tx_only_all(void *arg) +{ + struct pollfd fds[MAX_SOCKS] = { }; + u32 frame_nb = 0; + int pkt_cnt = 0; + int ret; + + fds[0].fd = xsk_socket__fd(((struct ifobject *)arg)->xsk->xsk); + fds[0].events = POLLOUT; + + while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { + int batch_size = get_batch_size(pkt_cnt); + + if (opt_poll) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret <= 0) + continue; + + if (!(fds[0].revents & POLLOUT)) + continue; + } + + tx_only(((struct ifobject *)arg)->xsk, &frame_nb, batch_size); + pkt_cnt += batch_size; + } + + if (opt_pkt_count) + complete_tx_only_all(arg); +} + +static void worker_pkt_dump(void) +{ + struct in_addr ipaddr; + + fprintf(stdout, "---------------------------------------\n"); + for (int iter = 0; iter < num_frames - 1; iter++) { + /*extract L2 frame */ + fprintf(stdout, "DEBUG>> L2: dst mac: "); + for (int i = 0; i < ETH_ALEN; i++) + fprintf(stdout, "%02X", ((struct ethhdr *) + pkt_buf[iter]->payload)->h_dest[i]); + + fprintf(stdout, "\nDEBUG>> L2: src mac: "); + for (int i = 0; i < ETH_ALEN; i++) + fprintf(stdout, "%02X", ((struct ethhdr *) + pkt_buf[iter]->payload)->h_source[i]); + + /*extract L3 frame */ + fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", + ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->ihl); + + ipaddr.s_addr = + ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->saddr; + fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", inet_ntoa(ipaddr)); + + ipaddr.s_addr = + ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->daddr; + fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", inet_ntoa(ipaddr)); + + /*extract L4 frame */ + fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", + ntohs(((struct udphdr *)(pkt_buf[iter]->payload + + sizeof(struct ethhdr) + + sizeof(struct iphdr)))->source)); + + fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", + ntohs(((struct udphdr *)(pkt_buf[iter]->payload + + sizeof(struct ethhdr) + + sizeof(struct iphdr)))->dest)); + /*extract L5 frame */ + int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE)); + + if (payload == EOT) { + ksft_print_msg("End-of-tranmission frame received\n"); + fprintf(stdout, "---------------------------------------\n"); + break; + } + fprintf(stdout, "DEBUG>> L5: payload: %d\n", payload); + fprintf(stdout, "---------------------------------------\n"); + } +} + +static void worker_pkt_validate(void) +{ + u32 payloadseqnum = -2; + + while (1) { + pkt_node_rx_q = malloc(sizeof(struct pkt)); + pkt_node_rx_q = TAILQ_LAST(&head, head_s); + if (!pkt_node_rx_q) + break; + /*do not increment pktcounter if !(tos=0x9 and ipv4) */ + if ((((struct iphdr *)(pkt_node_rx_q->pkt_frame + + sizeof(struct ethhdr)))->version == IP_PKT_VER) + && (((struct iphdr *)(pkt_node_rx_q->pkt_frame + sizeof(struct ethhdr)))->tos == + IP_PKT_TOS)) { + payloadseqnum = *((uint32_t *) (pkt_node_rx_q->pkt_frame + PKT_HDR_SIZE)); + if (debug_pkt_dump && payloadseqnum != EOT) { + pkt_obj = (struct pkt_frame *)malloc(sizeof(struct pkt_frame)); + pkt_obj->payload = (char *)malloc(PKT_SIZE); + memcpy(pkt_obj->payload, pkt_node_rx_q->pkt_frame, PKT_SIZE); + pkt_buf[payloadseqnum] = pkt_obj; + } + + if (payloadseqnum == EOT) { + ksft_print_msg("End-of-tranmission frame received: PASS\n"); + sigvar = 1; + break; + } + + if (prev_pkt + 1 != payloadseqnum) { + ksft_test_result_fail + ("ERROR: [%s] prev_pkt [%d], payloadseqnum [%d]\n", + __func__, prev_pkt, payloadseqnum); + ksft_exit_xfail(); + } + + TAILQ_REMOVE(&head, pkt_node_rx_q, pkt_nodes); + free(pkt_node_rx_q->pkt_frame); + free(pkt_node_rx_q); + pkt_node_rx_q = NULL; + prev_pkt = payloadseqnum; + pkt_counter++; + } else { + ksft_print_msg("Invalid frame received: "); + ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", + ((struct iphdr *)(pkt_node_rx_q->pkt_frame + + sizeof(struct ethhdr)))->version, + ((struct iphdr *)(pkt_node_rx_q->pkt_frame + + sizeof(struct ethhdr)))->tos); + TAILQ_REMOVE(&head, pkt_node_rx_q, pkt_nodes); + free(pkt_node_rx_q->pkt_frame); + free(pkt_node_rx_q); + pkt_node_rx_q = NULL; + } + } +} + +static void thread_common_ops(void *arg, void *bufs, pthread_mutex_t *mutexptr, + atomic_int *spinningptr) +{ + int ctr = 0; + int ret; + + xsk_configure_umem((struct ifobject *)arg, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); + ret = xsk_configure_socket((struct ifobject *)arg); + + /* Retry Create Socket if it fails as xsk_socket__create() + * is asynchronous + * + * Essential to lock Mutex here to prevent Tx thread from + * entering before Rx and causing a deadlock + */ + pthread_mutex_lock(mutexptr); + while (ret && ctr < SOCK_RECONF_CTR) { + atomic_store(spinningptr, 1); + xsk_configure_umem((struct ifobject *)arg, + bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); + ret = xsk_configure_socket((struct ifobject *)arg); + usleep(USLEEP_MAX); + ctr++; + } + atomic_store(spinningptr, 0); + pthread_mutex_unlock(mutexptr); + + if (ctr >= SOCK_RECONF_CTR) + exit_with_error(ret); +} + +static void *worker_testapp_validate(void *arg) +{ + struct udphdr *udp_hdr = + (struct udphdr *)(pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr)); + struct generic_data *data = (struct generic_data *)malloc(sizeof(struct generic_data)); + struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + sizeof(struct ethhdr)); + struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; + void *bufs = NULL; + + pthread_attr_setstacksize(&attr, THREAD_STACK); + + if (!bidi_pass) { + bufs = mmap(NULL, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE, + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (bufs == MAP_FAILED) + exit_with_error(errno); + + if (strcmp(((struct ifobject *)arg)->nsname, "")) + switch_namespace(((struct ifobject *)arg)->ifdict_index); + } + + if (((struct ifobject *)arg)->fv.vector == tx) { + int spinningrxctr = 0; + + if (!bidi_pass) + thread_common_ops(arg, bufs, &sync_mutex_tx, &spinning_tx); + + while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) { + spinningrxctr++; + usleep(USLEEP_MAX); + } + + ksft_print_msg("Interface [%s] vector [Tx]\n", ((struct ifobject *)arg)->ifname); + for (int i = 0; i < num_frames; i++) { + /*send EOT frame */ + if (i == (num_frames - 1)) + data->seqnum = -1; + else + data->seqnum = i; + gen_udp_hdr((void *)data, (void *)arg, udp_hdr); + gen_ip_hdr((void *)arg, ip_hdr); + gen_udp_csum(udp_hdr, ip_hdr); + gen_eth_hdr((void *)arg, eth_hdr); + gen_eth_frame(((struct ifobject *)arg)->umem, + i * XSK_UMEM__DEFAULT_FRAME_SIZE); + } + + free(data); + ksft_print_msg("Sending %d packets on interface %s\n", + (opt_pkt_count - 1), ((struct ifobject *)arg)->ifname); + tx_only_all(arg); + } else if (((struct ifobject *)arg)->fv.vector == rx) { + struct pollfd fds[MAX_SOCKS] = { }; + int ret; + + if (!bidi_pass) + thread_common_ops(arg, bufs, &sync_mutex_tx, &spinning_rx); + + ksft_print_msg("Interface [%s] vector [Rx]\n", ((struct ifobject *)arg)->ifname); + xsk_populate_fill_ring(((struct ifobject *)arg)->umem); + + TAILQ_INIT(&head); + if (debug_pkt_dump) { + pkt_buf = malloc(sizeof(struct pkt_frame **) * num_frames); + if (!pkt_buf) + exit_with_error(errno); + } + + fds[0].fd = xsk_socket__fd(((struct ifobject *)arg)->xsk->xsk); + fds[0].events = POLLIN; + + pthread_mutex_lock(&sync_mutex); + pthread_cond_signal(&signal_rx_condition); + pthread_mutex_unlock(&sync_mutex); + + while (1) { + if (opt_poll) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret <= 0) + continue; + } + rx_pkt(((struct ifobject *)arg)->xsk, fds); + worker_pkt_validate(); + + if (sigvar) + break; + } + + ksft_print_msg("Received %d packets on interface %s\n", + pkt_counter, ((struct ifobject *)arg)->ifname); + + if (opt_teardown) + ksft_print_msg("Destroying socket\n"); + } + + if (!opt_bidi || (opt_bidi && bidi_pass)) { + xsk_socket__delete(((struct ifobject *)arg)->xsk->xsk); + (void)xsk_umem__delete(((struct ifobject *)arg)->umem->umem); + } + pthread_exit(NULL); +} + +static void testapp_validate(void) +{ + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, THREAD_STACK); + + if (opt_bidi && bidi_pass) { + pthread_init_mutex(); + if (!switching_notify) { + ksft_print_msg("Switching Tx/Rx vectors\n"); + switching_notify++; + } + } + + pthread_mutex_lock(&sync_mutex); + + /*Spawn RX thread */ + if (!opt_bidi || (opt_bidi && !bidi_pass)) { + if (pthread_create(&t0, &attr, worker_testapp_validate, (void *)ifdict[1])) + exit_with_error(errno); + } else if (opt_bidi && bidi_pass) { + /*switch Tx/Rx vectors */ + ifdict[0]->fv.vector = rx; + if (pthread_create(&t0, &attr, worker_testapp_validate, (void *)ifdict[0])) + exit_with_error(errno); + } + + struct timespec max_wait = { 0, 0 }; + + if (clock_gettime(CLOCK_REALTIME, &max_wait)) + exit_with_error(errno); + max_wait.tv_sec += TMOUT_SEC; + + if (pthread_cond_timedwait(&signal_rx_condition, &sync_mutex, &max_wait) == ETIMEDOUT) + exit_with_error(errno); + + pthread_mutex_unlock(&sync_mutex); + + /*Spawn TX thread */ + if (!opt_bidi || (opt_bidi && !bidi_pass)) { + if (pthread_create(&t1, &attr, worker_testapp_validate, (void *)ifdict[0])) + exit_with_error(errno); + } else if (opt_bidi && bidi_pass) { + /*switch Tx/Rx vectors */ + ifdict[1]->fv.vector = tx; + if (pthread_create(&t1, &attr, worker_testapp_validate, (void *)ifdict[1])) + exit_with_error(errno); + } + + pthread_join(t1, NULL); + pthread_join(t0, NULL); + + if (debug_pkt_dump) { + worker_pkt_dump(); + for (int iter = 0; iter < num_frames - 1; iter++) { + free(pkt_buf[iter]->payload); + free(pkt_buf[iter]); + } + free(pkt_buf); + } + + if (!opt_teardown && !opt_bidi) + print_ksft_result(); +} + +static void testapp_sockets(void) +{ + for (int i = 0; i < (opt_teardown ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER); i++) { + pkt_counter = 0; + prev_pkt = -1; + sigvar = 0; + ksft_print_msg("Creating socket\n"); + testapp_validate(); + opt_bidi ? bidi_pass++ : bidi_pass; + } + + print_ksft_result(); +} + +static void init_iface_config(void *ifaceconfig) +{ + /*Init interface0 */ + ifdict[0]->fv.vector = tx; + memcpy(ifdict[0]->dst_mac, ((struct ifaceconfigobj *)ifaceconfig)->dst_mac, ETH_ALEN); + memcpy(ifdict[0]->src_mac, ((struct ifaceconfigobj *)ifaceconfig)->src_mac, ETH_ALEN); + ifdict[0]->dst_ip = ((struct ifaceconfigobj *)ifaceconfig)->dst_ip.s_addr; + ifdict[0]->src_ip = ((struct ifaceconfigobj *)ifaceconfig)->src_ip.s_addr; + ifdict[0]->dst_port = ((struct ifaceconfigobj *)ifaceconfig)->dst_port; + ifdict[0]->src_port = ((struct ifaceconfigobj *)ifaceconfig)->src_port; + + /*Init interface1 */ + ifdict[1]->fv.vector = rx; + memcpy(ifdict[1]->dst_mac, ((struct ifaceconfigobj *)ifaceconfig)->src_mac, ETH_ALEN); + memcpy(ifdict[1]->src_mac, ((struct ifaceconfigobj *)ifaceconfig)->dst_mac, ETH_ALEN); + ifdict[1]->dst_ip = ((struct ifaceconfigobj *)ifaceconfig)->src_ip.s_addr; + ifdict[1]->src_ip = ((struct ifaceconfigobj *)ifaceconfig)->dst_ip.s_addr; + ifdict[1]->dst_port = ((struct ifaceconfigobj *)ifaceconfig)->src_port; + ifdict[1]->src_port = ((struct ifaceconfigobj *)ifaceconfig)->dst_port; +} + +int main(int argc, char **argv) +{ + struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY }; + + if (setrlimit(RLIMIT_MEMLOCK, &_rlim)) + exit_with_error(errno); + + const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62"; + const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61"; + const char *IP1 = "192.168.100.162"; + const char *IP2 = "192.168.100.161"; + u16 UDP_DST_PORT = 2020; + u16 UDP_SRC_PORT = 2121; + + ifaceconfig = (struct ifaceconfigobj *)malloc(sizeof(struct ifaceconfigobj)); + memcpy(ifaceconfig->dst_mac, MAC1, ETH_ALEN); + memcpy(ifaceconfig->src_mac, MAC2, ETH_ALEN); + inet_aton(IP1, &ifaceconfig->dst_ip); + inet_aton(IP2, &ifaceconfig->src_ip); + ifaceconfig->dst_port = UDP_DST_PORT; + ifaceconfig->src_port = UDP_SRC_PORT; + + for (int i = 0; i < MAX_INTERFACES; i++) { + ifdict[i] = (struct ifobject *)malloc(sizeof(struct ifobject)); + if (!ifdict[i]) + exit_with_error(errno); + + ifdict[i]->ifdict_index = i; + } + + setlocale(LC_ALL, ""); + + parse_command_line(argc, argv); + + num_frames = ++opt_pkt_count; + + init_iface_config((void *)ifaceconfig); + + pthread_init_mutex(); + + ksft_set_plan(1); + + if (!opt_teardown && !opt_bidi) { + testapp_validate(); + } else if (opt_teardown && opt_bidi) { + ksft_test_result_fail("ERROR: parameters -T and -B cannot be used together\n"); + ksft_exit_xfail(); + } else { + testapp_sockets(); + } + + for (int i = 0; i < MAX_INTERFACES; i++) + free(ifdict[i]); + + pthread_destroy_mutex(); + + ksft_exit_pass(); + + return 0; +} diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h new file mode 100644 index 000000000000..61f595b6f200 --- /dev/null +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2020 Intel Corporation. + */ + +#ifndef XDPXCEIVER_H_ +#define XDPXCEIVER_H_ + +#ifndef SOL_XDP +#define SOL_XDP 283 +#endif + +#ifndef AF_XDP +#define AF_XDP 44 +#endif + +#ifndef PF_XDP +#define PF_XDP AF_XDP +#endif + +#define MAX_INTERFACES 2 +#define MAX_INTERFACE_NAME_CHARS 7 +#define MAX_INTERFACES_NAMESPACE_CHARS 10 +#define MAX_SOCKS 1 +#define MAX_TEARDOWN_ITER 10 +#define MAX_BIDI_ITER 2 +#define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct udphdr)) +#define MIN_PKT_SIZE 64 +#define ETH_FCS_SIZE 4 +#define PKT_SIZE (MIN_PKT_SIZE - ETH_FCS_SIZE) +#define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr)) +#define IP_PKT_VER 0x4 +#define IP_PKT_TOS 0x9 +#define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) +#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) +#define TMOUT_SEC (3) +#define EOT (-1) +#define USLEEP_MAX 200000 +#define THREAD_STACK 60000000 +#define SOCK_RECONF_CTR 10 +#define BATCH_SIZE 64 +#define POLL_TMOUT 1000 +#define NEED_WAKEUP true + +typedef __u32 u32; +typedef __u16 u16; +typedef __u8 u8; + +enum TESTS { + ORDER_CONTENT_VALIDATE_XDP_SKB = 0, + ORDER_CONTENT_VALIDATE_XDP_DRV = 1, +}; + +u8 uut; +u8 debug_pkt_dump; +u32 num_frames; +u8 switching_notify; +u8 bidi_pass; + +static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static int opt_queue; +static int opt_pkt_count; +static int opt_poll; +static int opt_teardown; +static int opt_bidi; +static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; +static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; +static u32 pkt_counter; +static u32 prev_pkt = -1; +static int sigvar; + +struct xsk_umem_info { + struct xsk_ring_prod fq; + struct xsk_ring_cons cq; + struct xsk_umem *umem; + void *buffer; +}; + +struct xsk_socket_info { + struct xsk_ring_cons rx; + struct xsk_ring_prod tx; + struct xsk_umem_info *umem; + struct xsk_socket *xsk; + unsigned long rx_npkts; + unsigned long tx_npkts; + unsigned long prev_rx_npkts; + unsigned long prev_tx_npkts; + u32 outstanding_tx; +}; + +struct flow_vector { + enum fvector { + tx, + rx, + bidi, + undef, + } vector; +}; + +struct generic_data { + u32 seqnum; +}; + +struct ifaceconfigobj { + u8 dst_mac[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; + struct in_addr dst_ip; + struct in_addr src_ip; + u16 src_port; + u16 dst_port; +} *ifaceconfig; + +struct ifobject { + int ifindex; + int ifdict_index; + char ifname[MAX_INTERFACE_NAME_CHARS]; + char nsname[MAX_INTERFACES_NAMESPACE_CHARS]; + struct flow_vector fv; + struct xsk_socket_info *xsk; + struct xsk_umem_info *umem; + u8 dst_mac[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; + u32 dst_ip; + u32 src_ip; + u16 src_port; + u16 dst_port; +}; + +static struct ifobject *ifdict[MAX_INTERFACES]; + +/*threads*/ +atomic_int spinning_tx; +atomic_int spinning_rx; +pthread_mutex_t sync_mutex; +pthread_mutex_t sync_mutex_tx; +pthread_cond_t signal_rx_condition; +pthread_cond_t signal_tx_condition; +pthread_t t0, t1, ns_thread; +pthread_attr_t attr; + +struct targs { + bool retptr; + int idx; +}; + +TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head); +struct head_s *head_p; +struct pkt { + char *pkt_frame; + + TAILQ_ENTRY(pkt) pkt_nodes; +} *pkt_node_rx, *pkt_node_rx_q; + +struct pkt_frame { + char *payload; +} *pkt_obj; + +struct pkt_frame **pkt_buf; + +#endif /* XDPXCEIVER_H */ diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh new file mode 100755 index 000000000000..9d54c4645127 --- /dev/null +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -0,0 +1,135 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright(c) 2020 Intel Corporation. + +ksft_pass=0 +ksft_fail=1 +ksft_xfail=2 +ksft_xpass=3 +ksft_skip=4 + +GREEN='\033[0;92m' +YELLOW='\033[0;93m' +RED='\033[0;31m' +NC='\033[0m' +STACK_LIM=131072 +SPECFILE=veth.spec +XSKOBJ=xdpxceiver +NUMPKTS=10000 + +validate_root_exec() +{ + msg="skip all tests:" + if [ $UID != 0 ]; then + echo $msg must be run as root >&2 + test_exit $ksft_fail 2 + else + return $ksft_pass + fi +} + +validate_veth_support() +{ + msg="skip all tests:" + if [ $(ip link add $1 type veth 2>/dev/null; echo $?;) != 0 ]; then + echo $msg veth kernel support not available >&2 + test_exit $ksft_skip 1 + else + ip link del $1 + return $ksft_pass + fi +} + +validate_veth_spec_file() +{ + if [ ! -f ${SPECFILE} ]; then + test_exit $ksft_skip 1 + fi +} + +test_status() +{ + statusval=$1 + if [ -n "${colorconsole+set}" ]; then + if [ $statusval -eq 2 ]; then + echo -e "${YELLOW}$2${NC}: [ ${RED}FAIL${NC} ]" + elif [ $statusval -eq 1 ]; then + echo -e "${YELLOW}$2${NC}: [ ${RED}SKIPPED${NC} ]" + elif [ $statusval -eq 0 ]; then + echo -e "${YELLOW}$2${NC}: [ ${GREEN}PASS${NC} ]" + fi + else + if [ $statusval -eq 2 ]; then + echo -e "$2: [ FAIL ]" + elif [ $statusval -eq 1 ]; then + echo -e "$2: [ SKIPPED ]" + elif [ $statusval -eq 0 ]; then + echo -e "$2: [ PASS ]" + fi + fi +} + +test_exit() +{ + retval=$1 + if [ $2 -ne 0 ]; then + test_status $2 $(basename $0) + fi + exit $retval +} + +clear_configs() +{ + if [ $(ip netns show | grep $3 &>/dev/null; echo $?;) == 0 ]; then + [ $(ip netns exec $3 ip link show $2 &>/dev/null; echo $?;) == 0 ] && + { echo "removing link $1:$2"; ip netns exec $3 ip link del $2; } + echo "removing ns $3" + ip netns del $3 + fi + #Once we delete a veth pair node, the entire veth pair is removed, + #this is just to be cautious just incase the NS does not exist then + #veth node inside NS won't get removed so we explicitly remove it + [ $(ip link show $1 &>/dev/null; echo $?;) == 0 ] && + { echo "removing link $1"; ip link del $1; } + if [ -f ${SPECFILE} ]; then + echo "removing spec file:" ${SPECFILE} + rm -f ${SPECFILE} + fi +} + +cleanup_exit() +{ + echo "cleaning up..." + clear_configs $1 $2 $3 +} + +validate_ip_utility() +{ + [ ! $(type -P ip) ] && { echo "'ip' not found. Skipping tests."; test_exit $ksft_skip 1; } +} + +vethXDPgeneric() +{ + ip link set dev $1 xdpdrv off + ip netns exec $3 ip link set dev $2 xdpdrv off +} + +vethXDPnative() +{ + ip link set dev $1 xdpgeneric off + ip netns exec $3 ip link set dev $2 xdpgeneric off +} + +execxdpxceiver() +{ + local -a 'paramkeys=("${!'"$1"'[@]}")' copy + paramkeysstr=${paramkeys[*]} + + for index in $paramkeysstr; + do + current=$1"[$index]" + copy[$index]=${!current} + done + + ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS} +} diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 05853b0b8831..027014662fb2 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -337,13 +337,13 @@ pid_t clone_into_cgroup(int cgroup_fd) #ifdef CLONE_ARGS_SIZE_VER2 pid_t pid; - struct clone_args args = { + struct __clone_args args = { .flags = CLONE_INTO_CGROUP, .exit_signal = SIGCHLD, .cgroup = cgroup_fd, }; - pid = sys_clone3(&args, sizeof(struct clone_args)); + pid = sys_clone3(&args, sizeof(struct __clone_args)); /* * Verify that this is a genuine test failure: * ENOSYS -> clone3() not available diff --git a/tools/testing/selftests/clone3/Makefile b/tools/testing/selftests/clone3/Makefile index ef7564cb7abe..79b19a2863a0 100644 --- a/tools/testing/selftests/clone3/Makefile +++ b/tools/testing/selftests/clone3/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -g -I../../../../usr/include/ +CFLAGS += -g -std=gnu99 -I../../../../usr/include/ LDLIBS += -lcap TEST_GEN_PROGS := clone3 clone3_clear_sighand clone3_set_tid \ diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index 575b391ddc78..87e16d65d9d7 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -11,6 +11,7 @@ #include <string.h> #include <syscall.h> #include <unistd.h> +#include <sys/resource.h> #include "../kselftest_harness.h" #include "../clone3/clone3_selftests.h" @@ -23,6 +24,10 @@ #define CLOSE_RANGE_UNSHARE (1U << 1) #endif +#ifndef CLOSE_RANGE_CLOEXEC +#define CLOSE_RANGE_CLOEXEC (1U << 2) +#endif + static inline int sys_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) { @@ -224,4 +229,73 @@ TEST(close_range_unshare_capped) EXPECT_EQ(0, WEXITSTATUS(status)); } +TEST(close_range_cloexec) +{ + int i, ret; + int open_fds[101]; + struct rlimit rlimit; + + for (i = 0; i < ARRAY_SIZE(open_fds); i++) { + int fd; + + fd = open("/dev/null", O_RDONLY); + ASSERT_GE(fd, 0) { + if (errno == ENOENT) + XFAIL(return, "Skipping test since /dev/null does not exist"); + } + + open_fds[i] = fd; + } + + ret = sys_close_range(1000, 1000, CLOSE_RANGE_CLOEXEC); + if (ret < 0) { + if (errno == ENOSYS) + XFAIL(return, "close_range() syscall not supported"); + if (errno == EINVAL) + XFAIL(return, "close_range() doesn't support CLOSE_RANGE_CLOEXEC"); + } + + /* Ensure the FD_CLOEXEC bit is set also with a resource limit in place. */ + ASSERT_EQ(0, getrlimit(RLIMIT_NOFILE, &rlimit)); + rlimit.rlim_cur = 25; + ASSERT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlimit)); + + /* Set close-on-exec for two ranges: [0-50] and [75-100]. */ + ret = sys_close_range(open_fds[0], open_fds[50], CLOSE_RANGE_CLOEXEC); + ASSERT_EQ(0, ret); + ret = sys_close_range(open_fds[75], open_fds[100], CLOSE_RANGE_CLOEXEC); + ASSERT_EQ(0, ret); + + for (i = 0; i <= 50; i++) { + int flags = fcntl(open_fds[i], F_GETFD); + + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); + } + + for (i = 51; i <= 74; i++) { + int flags = fcntl(open_fds[i], F_GETFD); + + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, 0); + } + + for (i = 75; i <= 100; i++) { + int flags = fcntl(open_fds[i], F_GETFD); + + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); + } + + /* Test a common pattern. */ + ret = sys_close_range(3, UINT_MAX, CLOSE_RANGE_CLOEXEC); + for (i = 0; i <= 100; i++) { + int flags = fcntl(open_fds[i], F_GETFD); + + EXPECT_GT(flags, -1); + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); + } +} + + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh index f5abb1ebd392..4029833f7e27 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh @@ -52,6 +52,7 @@ ALL_TESTS=" blackhole_route_test irif_disabled_test erif_disabled_test + blackhole_nexthop_test " NUM_NETIFS=4 @@ -647,6 +648,41 @@ erif_disabled_test() devlink_trap_action_set $trap_name "drop" } +__blackhole_nexthop_test() +{ + local flags=$1; shift + local subnet=$1; shift + local proto=$1; shift + local dip=$1; shift + local trap_name="blackhole_nexthop" + local mz_pid + + RET=0 + + ip -$flags nexthop add id 1 blackhole + ip -$flags route add $subnet nhid 1 + tc filter add dev $rp2 egress protocol $proto pref 1 handle 101 \ + flower skip_hw dst_ip $dip ip_proto udp action drop + + # Generate packets to the blackhole nexthop + $MZ $h1 -$flags -t udp "sp=54321,dp=12345" -c 0 -p 100 -b $rp1mac \ + -B $dip -d 1msec -q & + mz_pid=$! + + devlink_trap_drop_test $trap_name $rp2 101 + log_test "Blackhole nexthop: IPv$flags" + + devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101 + ip -$flags route del $subnet + ip -$flags nexthop del id 1 +} + +blackhole_nexthop_test() +{ + __blackhole_nexthop_test "4" "198.51.100.0/30" "ip" $h2_ipv4 + __blackhole_nexthop_test "6" "2001:db8:2::/120" "ipv6" $h2_ipv6 +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/drivers/net/mlxsw/q_in_q_veto.sh b/tools/testing/selftests/drivers/net/mlxsw/q_in_q_veto.sh new file mode 100755 index 000000000000..7edaed8eb86a --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/q_in_q_veto.sh @@ -0,0 +1,296 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + create_8021ad_vlan_upper_on_top_front_panel_port + create_8021ad_vlan_upper_on_top_bridge_port + create_8021ad_vlan_upper_on_top_lag + create_8021ad_vlan_upper_on_top_bridge + create_8021ad_vlan_upper_on_top_8021ad_bridge + create_vlan_upper_on_top_8021ad_bridge + create_vlan_upper_on_top_front_panel_enslaved_to_8021ad_bridge + create_vlan_upper_on_top_lag_enslaved_to_8021ad_bridge + enslave_front_panel_with_vlan_upper_to_8021ad_bridge + enslave_lag_with_vlan_upper_to_8021ad_bridge + add_ip_address_to_8021ad_bridge + switch_bridge_protocol_from_8021q_to_8021ad +" +NUM_NETIFS=2 +source $lib_dir/lib.sh + +setup_prepare() +{ + swp1=${NETIFS[p1]} + swp2=${NETIFS[p2]} + + ip link set dev $swp1 up + ip link set dev $swp2 up + + sleep 10 +} + +cleanup() +{ + pre_cleanup + + ip link set dev $swp2 down + ip link set dev $swp1 down +} + +create_vlan_upper_on_top_of_bridge() +{ + RET=0 + + local bridge_proto=$1; shift + local netdev_proto=$1; shift + + ip link add dev br0 type bridge vlan_filtering 1 \ + vlan_protocol $bridge_proto vlan_default_pvid 0 mcast_snooping 0 + + ip link set dev br0 up + ip link set dev $swp1 master br0 + + ip link add name br0.100 link br0 type vlan \ + protocol $netdev_proto id 100 2>/dev/null + check_fail $? "$netdev_proto vlan upper creation on top of an $bridge_proto bridge not rejected" + + ip link add name br0.100 link br0 type vlan \ + protocol $netdev_proto id 100 2>&1 >/dev/null \ + | grep -q mlxsw_spectrum + check_err $? "$netdev_proto vlan upper creation on top of an $bridge_proto bridge rejected without extack" + + log_test "create $netdev_proto vlan upper on top $bridge_proto bridge" + + ip link del dev br0 +} + +create_8021ad_vlan_upper_on_top_front_panel_port() +{ + RET=0 + + ip link add name $swp1.100 link $swp1 type vlan \ + protocol 802.1ad id 100 2>/dev/null + check_fail $? "802.1ad vlan upper creation on top of a front panel not rejected" + + ip link add name $swp1.100 link $swp1 type vlan \ + protocol 802.1ad id 100 2>&1 >/dev/null \ + | grep -q mlxsw_spectrum + check_err $? "802.1ad vlan upper creation on top of a front panel rejected without extack" + + log_test "create 802.1ad vlan upper on top of a front panel" +} + +create_8021ad_vlan_upper_on_top_bridge_port() +{ + RET=0 + + ip link add dev br0 type bridge vlan_filtering 1 \ + vlan_default_pvid 0 mcast_snooping 0 + + ip link set dev $swp1 master br0 + ip link set dev br0 up + + ip link add name $swp1.100 link $swp1 type vlan \ + protocol 802.1ad id 100 2>/dev/null + check_fail $? "802.1ad vlan upper creation on top of a bridge port not rejected" + + ip link add name $swp1.100 link $swp1 type vlan \ + protocol 802.1ad id 100 2>&1 >/dev/null \ + | grep -q mlxsw_spectrum + check_err $? "802.1ad vlan upper creation on top of a bridge port rejected without extack" + + log_test "create 802.1ad vlan upper on top of a bridge port" + + ip link del dev br0 +} + +create_8021ad_vlan_upper_on_top_lag() +{ + RET=0 + + ip link add name bond1 type bond mode 802.3ad + ip link set dev $swp1 down + ip link set dev $swp1 master bond1 + + ip link add name bond1.100 link bond1 type vlan \ + protocol 802.1ad id 100 2>/dev/null + check_fail $? "802.1ad vlan upper creation on top of a lag not rejected" + + ip link add name bond1.100 link bond1 type vlan \ + protocol 802.1ad id 100 2>&1 >/dev/null \ + | grep -q mlxsw_spectrum + check_err $? "802.1ad vlan upper creation on top of a lag rejected without extack" + + log_test "create 802.1ad vlan upper on top of a lag" + + ip link del dev bond1 +} + +create_8021ad_vlan_upper_on_top_bridge() +{ + RET=0 + + create_vlan_upper_on_top_of_bridge "802.1q" "802.1ad" +} + +create_8021ad_vlan_upper_on_top_8021ad_bridge() +{ + RET=0 + + create_vlan_upper_on_top_of_bridge "802.1ad" "802.1ad" +} + +create_vlan_upper_on_top_8021ad_bridge() +{ + RET=0 + + create_vlan_upper_on_top_of_bridge "802.1ad" "802.1q" +} + +create_vlan_upper_on_top_front_panel_enslaved_to_8021ad_bridge() +{ + RET=0 + + ip link add dev br0 type bridge vlan_filtering 1 \ + vlan_protocol 802.1ad vlan_default_pvid 0 mcast_snooping 0 + ip link set dev br0 up + + ip link set dev $swp1 master br0 + + ip link add name $swp1.100 link $swp1 type vlan id 100 2>/dev/null + check_fail $? "vlan upper creation on top of front panel enslaved to 802.1ad bridge not rejected" + + ip link add name $swp1.100 link $swp1 type vlan id 100 2>&1 >/dev/null \ + | grep -q mlxsw_spectrum + check_err $? "vlan upper creation on top of front panel enslaved to 802.1ad bridge rejected without extack" + + log_test "create vlan upper on top of front panel enslaved to 802.1ad bridge" + + ip link del dev br0 +} + +create_vlan_upper_on_top_lag_enslaved_to_8021ad_bridge() +{ + RET=0 + + ip link add dev br0 type bridge vlan_filtering 1 \ + vlan_protocol 802.1ad vlan_default_pvid 0 mcast_snooping 0 + ip link set dev br0 up + + ip link add name bond1 type bond mode 802.3ad + ip link set dev $swp1 down + ip link set dev $swp1 master bond1 + ip link set dev bond1 master br0 + + ip link add name bond1.100 link bond1 type vlan id 100 2>/dev/null + check_fail $? "vlan upper creation on top of lag enslaved to 802.1ad bridge not rejected" + + ip link add name bond1.100 link bond1 type vlan id 100 2>&1 >/dev/null \ + | grep -q mlxsw_spectrum + check_err $? "vlan upper creation on top of lag enslaved to 802.1ad bridge rejected without extack" + + log_test "create vlan upper on top of lag enslaved to 802.1ad bridge" + + ip link del dev bond1 + ip link del dev br0 +} + +enslave_front_panel_with_vlan_upper_to_8021ad_bridge() +{ + RET=0 + + ip link add dev br0 type bridge vlan_filtering 1 \ + vlan_protocol 802.1ad vlan_default_pvid 0 mcast_snooping 0 + ip link set dev br0 up + + ip link add name $swp1.100 link $swp1 type vlan id 100 + + ip link set dev $swp1 master br0 2>/dev/null + check_fail $? "front panel with vlan upper enslavemnt to 802.1ad bridge not rejected" + + ip link set dev $swp1 master br0 2>&1 >/dev/null | grep -q mlxsw_spectrum + check_err $? "front panel with vlan upper enslavemnt to 802.1ad bridge rejected without extack" + + log_test "enslave front panel with vlan upper to 802.1ad bridge" + + ip link del dev $swp1.100 + ip link del dev br0 +} + +enslave_lag_with_vlan_upper_to_8021ad_bridge() +{ + RET=0 + + ip link add dev br0 type bridge vlan_filtering 1 \ + vlan_protocol 802.1ad vlan_default_pvid 0 mcast_snooping 0 + ip link set dev br0 up + + ip link add name bond1 type bond mode 802.3ad + ip link set dev $swp1 down + ip link set dev $swp1 master bond1 + ip link add name bond1.100 link bond1 type vlan id 100 + + ip link set dev bond1 master br0 2>/dev/null + check_fail $? "lag with vlan upper enslavemnt to 802.1ad bridge not rejected" + + ip link set dev bond1 master br0 2>&1 >/dev/null \ + | grep -q mlxsw_spectrum + check_err $? "lag with vlan upper enslavemnt to 802.1ad bridge rejected without extack" + + log_test "enslave lag with vlan upper to 802.1ad bridge" + + ip link del dev bond1 + ip link del dev br0 +} + + +add_ip_address_to_8021ad_bridge() +{ + RET=0 + + ip link add dev br0 type bridge vlan_filtering 1 \ + vlan_protocol 802.1ad vlan_default_pvid 0 mcast_snooping 0 + + ip link set dev br0 up + ip link set dev $swp1 master br0 + + ip addr add dev br0 192.0.2.17/28 2>/dev/null + check_fail $? "IP address addition to 802.1ad bridge not rejected" + + ip addr add dev br0 192.0.2.17/28 2>&1 >/dev/null | grep -q mlxsw_spectrum + check_err $? "IP address addition to 802.1ad bridge rejected without extack" + + log_test "IP address addition to 802.1ad bridge" + + ip link del dev br0 +} + +switch_bridge_protocol_from_8021q_to_8021ad() +{ + RET=0 + + ip link add dev br0 type bridge vlan_filtering 1 \ + vlan_protocol 802.1ad vlan_default_pvid 0 mcast_snooping 0 + + ip link set dev br0 up + ip link set dev $swp1 master br0 + + ip link set dev br0 type bridge vlan_protocol 802.1q 2>/dev/null + check_fail $? "switching bridge protocol from 802.1q to 802.1ad not rejected" + + log_test "switch bridge protocol" + + ip link del dev br0 +} + + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh index f4031002d5e9..ed346da5d3cb 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh @@ -22,6 +22,7 @@ ALL_TESTS=" duplicate_vlans_test vlan_rif_refcount_test subport_rif_refcount_test + subport_rif_lag_join_test vlan_dev_deletion_test lag_unlink_slaves_test lag_dev_deletion_test @@ -29,6 +30,11 @@ ALL_TESTS=" bridge_extern_learn_test neigh_offload_test nexthop_offload_test + nexthop_obj_invalid_test + nexthop_obj_offload_test + nexthop_obj_group_offload_test + nexthop_obj_blackhole_offload_test + nexthop_obj_route_offload_test devlink_reload_test " NUM_NETIFS=2 @@ -435,6 +441,48 @@ subport_rif_refcount_test() ip link del dev bond1 } +subport_rif_lag_join_test() +{ + # Test that the reference count of a RIF configured for a LAG is + # incremented / decremented when ports join / leave the LAG. We use the + # offload indication on routes configured on the RIF to understand if + # it was created / destroyed + RET=0 + + ip link add name bond1 type bond mode 802.3ad + ip link set dev $swp1 down + ip link set dev $swp2 down + ip link set dev $swp1 master bond1 + ip link set dev $swp2 master bond1 + + ip link set dev bond1 up + ip -6 address add 2001:db8:1::1/64 dev bond1 + + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev bond1 + check_err $? "subport rif was not created on lag device" + + ip link set dev $swp1 nomaster + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev bond1 + check_err $? "subport rif of lag device was destroyed after removing one port" + + ip link set dev $swp1 master bond1 + ip link set dev $swp2 nomaster + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev bond1 + check_err $? "subport rif of lag device was destroyed after re-adding a port and removing another" + + ip link set dev $swp1 nomaster + busywait "$TIMEOUT" not wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev bond1 + check_err $? "subport rif of lag device was not destroyed when should" + + log_test "subport rif lag join" + + ip link del dev bond1 +} + vlan_dev_deletion_test() { # Test that VLAN devices are correctly deleted / unlinked when enslaved @@ -674,6 +722,209 @@ nexthop_offload_test() sysctl_restore net.ipv6.conf.$swp2.keep_addr_on_down } +nexthop_obj_invalid_test() +{ + # Test that invalid nexthop object configurations are rejected + RET=0 + + simple_if_init $swp1 192.0.2.1/24 2001:db8:1::1/64 + simple_if_init $swp2 192.0.2.2/24 2001:db8:1::2/64 + setup_wait + + ip nexthop add id 1 via 192.0.2.3 fdb + check_fail $? "managed to configure an FDB nexthop when should not" + + ip nexthop add id 1 encap mpls 200/300 via 192.0.2.3 dev $swp1 + check_fail $? "managed to configure a nexthop with MPLS encap when should not" + + ip nexthop add id 1 dev $swp1 + ip nexthop add id 2 dev $swp1 + ip nexthop add id 10 group 1/2 + check_fail $? "managed to configure a nexthop group with device-only nexthops when should not" + + log_test "nexthop objects - invalid configurations" + + ip nexthop del id 2 + ip nexthop del id 1 + + simple_if_fini $swp2 192.0.2.2/24 2001:db8:1::2/64 + simple_if_fini $swp1 192.0.2.1/24 2001:db8:1::1/64 +} + +nexthop_obj_offload_test() +{ + # Test offload indication of nexthop objects + RET=0 + + simple_if_init $swp1 192.0.2.1/24 2001:db8:1::1/64 + simple_if_init $swp2 + setup_wait + + ip nexthop add id 1 via 192.0.2.2 dev $swp1 + ip neigh replace 192.0.2.2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop show id 1 + check_err $? "nexthop not marked as offloaded when should" + + ip neigh replace 192.0.2.2 nud failed dev $swp1 + busywait "$TIMEOUT" not wait_for_offload \ + ip nexthop show id 1 + check_err $? "nexthop marked as offloaded after setting neigh to failed state" + + ip neigh replace 192.0.2.2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop show id 1 + check_err $? "nexthop not marked as offloaded after neigh replace" + + ip nexthop replace id 1 via 192.0.2.3 dev $swp1 + busywait "$TIMEOUT" not wait_for_offload \ + ip nexthop show id 1 + check_err $? "nexthop marked as offloaded after replacing to use an invalid address" + + ip nexthop replace id 1 via 192.0.2.2 dev $swp1 + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop show id 1 + check_err $? "nexthop not marked as offloaded after replacing to use a valid address" + + log_test "nexthop objects offload indication" + + ip neigh del 192.0.2.2 dev $swp1 + ip nexthop del id 1 + + simple_if_fini $swp2 + simple_if_fini $swp1 192.0.2.1/24 2001:db8:1::1/64 +} + +nexthop_obj_group_offload_test() +{ + # Test offload indication of nexthop group objects + RET=0 + + simple_if_init $swp1 192.0.2.1/24 2001:db8:1::1/64 + simple_if_init $swp2 + setup_wait + + ip nexthop add id 1 via 192.0.2.2 dev $swp1 + ip nexthop add id 2 via 2001:db8:1::2 dev $swp1 + ip nexthop add id 10 group 1/2 + ip neigh replace 192.0.2.2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + ip neigh replace 192.0.2.3 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + ip neigh replace 2001:db8:1::2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop show id 1 + check_err $? "IPv4 nexthop not marked as offloaded when should" + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop show id 2 + check_err $? "IPv6 nexthop not marked as offloaded when should" + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop show id 10 + check_err $? "nexthop group not marked as offloaded when should" + + # Invalidate nexthop id 1 + ip neigh replace 192.0.2.2 nud failed dev $swp1 + busywait "$TIMEOUT" not wait_for_offload \ + ip nexthop show id 10 + check_fail $? "nexthop group not marked as offloaded with one valid nexthop" + + # Invalidate nexthop id 2 + ip neigh replace 2001:db8:1::2 nud failed dev $swp1 + busywait "$TIMEOUT" not wait_for_offload \ + ip nexthop show id 10 + check_err $? "nexthop group marked as offloaded when should not" + + # Revalidate nexthop id 1 + ip nexthop replace id 1 via 192.0.2.3 dev $swp1 + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop show id 10 + check_err $? "nexthop group not marked as offloaded after revalidating nexthop" + + log_test "nexthop group objects offload indication" + + ip neigh del 2001:db8:1::2 dev $swp1 + ip neigh del 192.0.2.3 dev $swp1 + ip neigh del 192.0.2.2 dev $swp1 + ip nexthop del id 10 + ip nexthop del id 2 + ip nexthop del id 1 + + simple_if_fini $swp2 + simple_if_fini $swp1 192.0.2.1/24 2001:db8:1::1/64 +} + +nexthop_obj_blackhole_offload_test() +{ + # Test offload indication of blackhole nexthop objects + RET=0 + + ip nexthop add id 1 blackhole + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop show id 1 + check_err $? "Blackhole nexthop not marked as offloaded when should" + + ip nexthop add id 10 group 1 + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop show id 10 + check_err $? "Nexthop group not marked as offloaded when should" + + log_test "blackhole nexthop objects offload indication" + + ip nexthop del id 10 + ip nexthop del id 1 +} + +nexthop_obj_route_offload_test() +{ + # Test offload indication of routes using nexthop objects + RET=0 + + simple_if_init $swp1 192.0.2.1/24 2001:db8:1::1/64 + simple_if_init $swp2 + setup_wait + + ip nexthop add id 1 via 192.0.2.2 dev $swp1 + ip neigh replace 192.0.2.2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + ip neigh replace 192.0.2.3 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + + ip route replace 198.51.100.0/24 nhid 1 + busywait "$TIMEOUT" wait_for_offload \ + ip route show 198.51.100.0/24 + check_err $? "route not marked as offloaded when using valid nexthop" + + ip nexthop replace id 1 via 192.0.2.3 dev $swp1 + busywait "$TIMEOUT" wait_for_offload \ + ip route show 198.51.100.0/24 + check_err $? "route not marked as offloaded after replacing valid nexthop with a valid one" + + ip nexthop replace id 1 via 192.0.2.4 dev $swp1 + busywait "$TIMEOUT" not wait_for_offload \ + ip route show 198.51.100.0/24 + check_err $? "route marked as offloaded after replacing valid nexthop with an invalid one" + + ip nexthop replace id 1 via 192.0.2.2 dev $swp1 + busywait "$TIMEOUT" wait_for_offload \ + ip route show 198.51.100.0/24 + check_err $? "route not marked as offloaded after replacing invalid nexthop with a valid one" + + log_test "routes using nexthop objects offload indication" + + ip route del 198.51.100.0/24 + ip neigh del 192.0.2.3 dev $swp1 + ip neigh del 192.0.2.2 dev $swp1 + ip nexthop del id 1 + + simple_if_fini $swp2 + simple_if_fini $swp1 192.0.2.1/24 2001:db8:1::1/64 +} + devlink_reload_test() { # Test that after executing all the above configuration tests, a diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh new file mode 100755 index 000000000000..0231205a7147 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +lib_dir=$(dirname $0)/../../../../net/forwarding + +VXPORT=4789 + +ALL_TESTS=" + create_dot1d_and_dot1ad_vxlans +" +NUM_NETIFS=2 +source $lib_dir/lib.sh + +setup_prepare() +{ + swp1=${NETIFS[p1]} + swp2=${NETIFS[p2]} + + ip link set dev $swp1 up + ip link set dev $swp2 up +} + +cleanup() +{ + pre_cleanup + + ip link set dev $swp2 down + ip link set dev $swp1 down +} + +create_dot1d_and_dot1ad_vxlans() +{ + RET=0 + + ip link add dev br0 type bridge vlan_filtering 1 vlan_protocol 802.1ad \ + vlan_default_pvid 0 mcast_snooping 0 + ip link set dev br0 up + + ip link add name vx100 type vxlan id 1000 local 192.0.2.17 dstport \ + "$VXPORT" nolearning noudpcsum tos inherit ttl 100 + ip link set dev vx100 up + + ip link set dev $swp1 master br0 + ip link set dev vx100 master br0 + bridge vlan add vid 100 dev vx100 pvid untagged + + ip link add dev br1 type bridge vlan_filtering 0 mcast_snooping 0 + ip link set dev br1 up + + ip link add name vx200 type vxlan id 2000 local 192.0.2.17 dstport \ + "$VXPORT" nolearning noudpcsum tos inherit ttl 100 + ip link set dev vx200 up + + ip link set dev $swp2 master br1 + ip link set dev vx200 master br1 2>/dev/null + check_fail $? "802.1d and 802.1ad VxLANs at the same time not rejected" + + ip link set dev vx200 master br1 2>&1 >/dev/null \ + | grep -q mlxsw_spectrum + check_err $? "802.1d and 802.1ad VxLANs at the same time rejected without extack" + + log_test "create 802.1d and 802.1ad VxLANs" + + ip link del dev vx200 + ip link del dev br1 + ip link del dev vx100 + ip link del dev br0 +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/q_in_vni_veto.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/q_in_vni_veto.sh new file mode 100755 index 000000000000..f0443b1b05b9 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/q_in_vni_veto.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +lib_dir=$(dirname $0)/../../../../net/forwarding + +VXPORT=4789 + +ALL_TESTS=" + create_vxlan_on_top_of_8021ad_bridge +" +NUM_NETIFS=2 +source $lib_dir/lib.sh + +setup_prepare() +{ + swp1=${NETIFS[p1]} + swp2=${NETIFS[p2]} + + ip link set dev $swp1 up + ip link set dev $swp2 up +} + +cleanup() +{ + pre_cleanup + + ip link set dev $swp2 down + ip link set dev $swp1 down +} + +create_vxlan_on_top_of_8021ad_bridge() +{ + RET=0 + + ip link add dev br0 type bridge vlan_filtering 1 vlan_protocol 802.1ad \ + vlan_default_pvid 0 mcast_snooping 0 + ip link set dev br0 up + + ip link add name vx100 type vxlan id 1000 local 192.0.2.17 dstport \ + "$VXPORT" nolearning noudpcsum tos inherit ttl 100 + ip link set dev vx100 up + + ip link set dev $swp1 master br0 + ip link set dev vx100 master br0 + + bridge vlan add vid 100 dev vx100 pvid untagged 2>/dev/null + check_fail $? "802.1ad bridge with VxLAN in Spectrum-1 not rejected" + + bridge vlan add vid 100 dev vx100 pvid untagged 2>&1 >/dev/null \ + | grep -q mlxsw_spectrum + check_err $? "802.1ad bridge with VxLAN in Spectrum-1 rejected without extack" + + log_test "create VxLAN on top of 802.1ad bridge" + + ip link del dev vx100 + ip link del dev br0 +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-coalesce.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-coalesce.sh new file mode 100755 index 000000000000..9adfba8f87e6 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-coalesce.sh @@ -0,0 +1,132 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only + +source ethtool-common.sh + +function get_value { + local query="${SETTINGS_MAP[$1]}" + + echo $(ethtool -c $NSIM_NETDEV | \ + awk -F':' -v pattern="$query:" '$0 ~ pattern {gsub(/[ \t]/, "", $2); print $2}') +} + +function update_current_settings { + for key in ${!SETTINGS_MAP[@]}; do + CURRENT_SETTINGS[$key]=$(get_value $key) + done + echo ${CURRENT_SETTINGS[@]} +} + +if ! ethtool -h | grep -q coalesce; then + echo "SKIP: No --coalesce support in ethtool" + exit 4 +fi + +NSIM_NETDEV=$(make_netdev) + +set -o pipefail + +declare -A SETTINGS_MAP=( + ["rx-frames-low"]="rx-frame-low" + ["tx-frames-low"]="tx-frame-low" + ["rx-frames-high"]="rx-frame-high" + ["tx-frames-high"]="tx-frame-high" + ["rx-usecs"]="rx-usecs" + ["rx-frames"]="rx-frames" + ["rx-usecs-irq"]="rx-usecs-irq" + ["rx-frames-irq"]="rx-frames-irq" + ["tx-usecs"]="tx-usecs" + ["tx-frames"]="tx-frames" + ["tx-usecs-irq"]="tx-usecs-irq" + ["tx-frames-irq"]="tx-frames-irq" + ["stats-block-usecs"]="stats-block-usecs" + ["pkt-rate-low"]="pkt-rate-low" + ["rx-usecs-low"]="rx-usecs-low" + ["tx-usecs-low"]="tx-usecs-low" + ["pkt-rate-high"]="pkt-rate-high" + ["rx-usecs-high"]="rx-usecs-high" + ["tx-usecs-high"]="tx-usecs-high" + ["sample-interval"]="sample-interval" +) + +declare -A CURRENT_SETTINGS=( + ["rx-frames-low"]="" + ["tx-frames-low"]="" + ["rx-frames-high"]="" + ["tx-frames-high"]="" + ["rx-usecs"]="" + ["rx-frames"]="" + ["rx-usecs-irq"]="" + ["rx-frames-irq"]="" + ["tx-usecs"]="" + ["tx-frames"]="" + ["tx-usecs-irq"]="" + ["tx-frames-irq"]="" + ["stats-block-usecs"]="" + ["pkt-rate-low"]="" + ["rx-usecs-low"]="" + ["tx-usecs-low"]="" + ["pkt-rate-high"]="" + ["rx-usecs-high"]="" + ["tx-usecs-high"]="" + ["sample-interval"]="" +) + +declare -A EXPECTED_SETTINGS=( + ["rx-frames-low"]="" + ["tx-frames-low"]="" + ["rx-frames-high"]="" + ["tx-frames-high"]="" + ["rx-usecs"]="" + ["rx-frames"]="" + ["rx-usecs-irq"]="" + ["rx-frames-irq"]="" + ["tx-usecs"]="" + ["tx-frames"]="" + ["tx-usecs-irq"]="" + ["tx-frames-irq"]="" + ["stats-block-usecs"]="" + ["pkt-rate-low"]="" + ["rx-usecs-low"]="" + ["tx-usecs-low"]="" + ["pkt-rate-high"]="" + ["rx-usecs-high"]="" + ["tx-usecs-high"]="" + ["sample-interval"]="" +) + +# populate the expected settings map +for key in ${!SETTINGS_MAP[@]}; do + EXPECTED_SETTINGS[$key]=$(get_value $key) +done + +# test +for key in ${!SETTINGS_MAP[@]}; do + value=$((RANDOM % $((2**32-1)))) + + ethtool -C $NSIM_NETDEV "$key" "$value" + + EXPECTED_SETTINGS[$key]="$value" + expected=${EXPECTED_SETTINGS[@]} + current=$(update_current_settings) + + check $? "$current" "$expected" + set +x +done + +# bool settings which ethtool displays on the same line +ethtool -C $NSIM_NETDEV adaptive-rx on +s=$(ethtool -c $NSIM_NETDEV | grep -q "Adaptive RX: on TX: off") +check $? "$s" "" + +ethtool -C $NSIM_NETDEV adaptive-tx on +s=$(ethtool -c $NSIM_NETDEV | grep -q "Adaptive RX: on TX: on") +check $? "$s" "" + +if [ $num_errors -eq 0 ]; then + echo "PASSED all $((num_passes)) checks" + exit 0 +else + echo "FAILED $num_errors/$((num_errors+num_passes)) checks" + exit 1 +fi diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh new file mode 100644 index 000000000000..9f64d5c7107b --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only + +NSIM_ID=$((RANDOM % 1024)) +NSIM_DEV_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_ID +NSIM_DEV_DFS=/sys/kernel/debug/netdevsim/netdevsim$NSIM_ID/ports/0 +NSIM_NETDEV= +num_passes=0 +num_errors=0 + +function cleanup_nsim { + if [ -e $NSIM_DEV_SYS ]; then + echo $NSIM_ID > /sys/bus/netdevsim/del_device + fi +} + +function cleanup { + cleanup_nsim +} + +trap cleanup EXIT + +function check { + local code=$1 + local str=$2 + local exp_str=$3 + + if [ $code -ne 0 ]; then + ((num_errors++)) + return + fi + + if [ "$str" != "$exp_str" ]; then + echo -e "Expected: '$exp_str', got '$str'" + ((num_errors++)) + return + fi + + ((num_passes++)) +} + +function make_netdev { + # Make a netdevsim + old_netdevs=$(ls /sys/class/net) + + if ! $(lsmod | grep -q netdevsim); then + modprobe netdevsim + fi + + echo $NSIM_ID > /sys/bus/netdevsim/new_device + # get new device name + ls /sys/bus/netdevsim/devices/netdevsim${NSIM_ID}/net/ +} diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-pause.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-pause.sh index 25c896b9e2eb..b4a7abfe5454 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/ethtool-pause.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-pause.sh @@ -1,60 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0-only -NSIM_ID=$((RANDOM % 1024)) -NSIM_DEV_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_ID -NSIM_DEV_DFS=/sys/kernel/debug/netdevsim/netdevsim$NSIM_ID/ports/0 -NSIM_NETDEV= -num_passes=0 -num_errors=0 - -function cleanup_nsim { - if [ -e $NSIM_DEV_SYS ]; then - echo $NSIM_ID > /sys/bus/netdevsim/del_device - fi -} - -function cleanup { - cleanup_nsim -} - -trap cleanup EXIT - -function get_netdev_name { - local -n old=$1 - - new=$(ls /sys/class/net) - - for netdev in $new; do - for check in $old; do - [ $netdev == $check ] && break - done - - if [ $netdev != $check ]; then - echo $netdev - break - fi - done -} - -function check { - local code=$1 - local str=$2 - local exp_str=$3 - - if [ $code -ne 0 ]; then - ((num_errors++)) - return - fi - - if [ "$str" != "$exp_str" ]; then - echo -e "Expected: '$exp_str', got '$str'" - ((num_errors++)) - return - fi - - ((num_passes++)) -} +source ethtool-common.sh # Bail if ethtool is too old if ! ethtool -h | grep include-stat 2>&1 >/dev/null; then @@ -62,13 +9,7 @@ if ! ethtool -h | grep include-stat 2>&1 >/dev/null; then exit 4 fi -# Make a netdevsim -old_netdevs=$(ls /sys/class/net) - -modprobe netdevsim -echo $NSIM_ID > /sys/bus/netdevsim/new_device - -NSIM_NETDEV=`get_netdev_name old_netdevs` +NSIM_NETDEV=$(make_netdev) set -o pipefail diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-ring.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-ring.sh new file mode 100755 index 000000000000..c969559ffa7a --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-ring.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only + +source ethtool-common.sh + +function get_value { + local query="${SETTINGS_MAP[$1]}" + + echo $(ethtool -g $NSIM_NETDEV | \ + tail -n +$CURR_SETT_LINE | \ + awk -F':' -v pattern="$query:" '$0 ~ pattern {gsub(/[\t ]/, "", $2); print $2}') +} + +function update_current_settings { + for key in ${!SETTINGS_MAP[@]}; do + CURRENT_SETTINGS[$key]=$(get_value $key) + done + echo ${CURRENT_SETTINGS[@]} +} + +if ! ethtool -h | grep -q set-ring >/dev/null; then + echo "SKIP: No --set-ring support in ethtool" + exit 4 +fi + +NSIM_NETDEV=$(make_netdev) + +set -o pipefail + +declare -A SETTINGS_MAP=( + ["rx"]="RX" + ["rx-mini"]="RX Mini" + ["rx-jumbo"]="RX Jumbo" + ["tx"]="TX" +) + +declare -A EXPECTED_SETTINGS=( + ["rx"]="" + ["rx-mini"]="" + ["rx-jumbo"]="" + ["tx"]="" +) + +declare -A CURRENT_SETTINGS=( + ["rx"]="" + ["rx-mini"]="" + ["rx-jumbo"]="" + ["tx"]="" +) + +MAX_VALUE=$((RANDOM % $((2**32-1)))) +RING_MAX_LIST=$(ls $NSIM_DEV_DFS/ethtool/ring/) + +for ring_max_entry in $RING_MAX_LIST; do + echo $MAX_VALUE > $NSIM_DEV_DFS/ethtool/ring/$ring_max_entry +done + +CURR_SETT_LINE=$(ethtool -g $NSIM_NETDEV | grep -i -m1 -n 'Current hardware settings' | cut -f1 -d:) + +# populate the expected settings map +for key in ${!SETTINGS_MAP[@]}; do + EXPECTED_SETTINGS[$key]=$(get_value $key) +done + +# test +for key in ${!SETTINGS_MAP[@]}; do + value=$((RANDOM % $MAX_VALUE)) + + ethtool -G $NSIM_NETDEV "$key" "$value" + + EXPECTED_SETTINGS[$key]="$value" + expected=${EXPECTED_SETTINGS[@]} + current=$(update_current_settings) + + check $? "$current" "$expected" + set +x +done + +if [ $num_errors -eq 0 ]; then + echo "PASSED all $((num_passes)) checks" + exit 0 +else + echo "FAILED $num_errors/$((num_errors+num_passes)) checks" + exit 1 +fi diff --git a/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh new file mode 100755 index 000000000000..be0c1b5ee6b8 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh @@ -0,0 +1,436 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This test is for checking the nexthop offload API. It makes use of netdevsim +# which registers a listener to the nexthop notification chain. + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + nexthop_single_add_test + nexthop_single_add_err_test + nexthop_group_add_test + nexthop_group_add_err_test + nexthop_group_replace_test + nexthop_group_replace_err_test + nexthop_single_replace_test + nexthop_single_replace_err_test + nexthop_single_in_group_replace_test + nexthop_single_in_group_replace_err_test + nexthop_single_in_group_delete_test + nexthop_single_in_group_delete_err_test + nexthop_replay_test + nexthop_replay_err_test +" +NETDEVSIM_PATH=/sys/bus/netdevsim/ +DEV_ADDR=1337 +DEV=netdevsim${DEV_ADDR} +DEVLINK_DEV=netdevsim/${DEV} +SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV/net/ +NUM_NETIFS=0 +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +nexthop_check() +{ + local nharg="$1"; shift + local expected="$1"; shift + + out=$($IP nexthop show ${nharg} | sed -e 's/ *$//') + if [[ "$out" != "$expected" ]]; then + return 1 + fi + + return 0 +} + +nexthop_resource_check() +{ + local expected_occ=$1; shift + + occ=$($DEVLINK -jp resource show $DEVLINK_DEV \ + | jq '.[][][] | select(.name=="nexthops") | .["occ"]') + + if [ $expected_occ -ne $occ ]; then + return 1 + fi + + return 0 +} + +nexthop_resource_set() +{ + local size=$1; shift + + $DEVLINK resource set $DEVLINK_DEV path nexthops size $size + $DEVLINK dev reload $DEVLINK_DEV +} + +nexthop_single_add_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + nexthop_check "id 1" "id 1 via 192.0.2.2 dev dummy1 scope link trap" + check_err $? "Unexpected nexthop entry" + + nexthop_resource_check 1 + check_err $? "Wrong nexthop occupancy" + + $IP nexthop del id 1 + nexthop_resource_check 0 + check_err $? "Wrong nexthop occupancy after delete" + + log_test "Single nexthop add and delete" +} + +nexthop_single_add_err_test() +{ + RET=0 + + nexthop_resource_set 1 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 &> /dev/null + check_fail $? "Nexthop addition succeeded when should fail" + + nexthop_resource_check 1 + check_err $? "Wrong nexthop occupancy" + + log_test "Single nexthop add failure" + + $IP nexthop flush &> /dev/null + nexthop_resource_set 9999 +} + +nexthop_group_add_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + $IP nexthop add id 10 group 1/2 + nexthop_check "id 10" "id 10 group 1/2 trap" + check_err $? "Unexpected nexthop group entry" + + nexthop_resource_check 4 + check_err $? "Wrong nexthop occupancy" + + $IP nexthop del id 10 + nexthop_resource_check 2 + check_err $? "Wrong nexthop occupancy after delete" + + $IP nexthop add id 10 group 1,20/2,39 + nexthop_check "id 10" "id 10 group 1,20/2,39 trap" + check_err $? "Unexpected weighted nexthop group entry" + + nexthop_resource_check 61 + check_err $? "Wrong weighted nexthop occupancy" + + $IP nexthop del id 10 + nexthop_resource_check 2 + check_err $? "Wrong nexthop occupancy after delete" + + log_test "Nexthop group add and delete" + + $IP nexthop flush &> /dev/null +} + +nexthop_group_add_err_test() +{ + RET=0 + + nexthop_resource_set 2 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + $IP nexthop add id 10 group 1/2 &> /dev/null + check_fail $? "Nexthop group addition succeeded when should fail" + + nexthop_resource_check 2 + check_err $? "Wrong nexthop occupancy" + + log_test "Nexthop group add failure" + + $IP nexthop flush &> /dev/null + nexthop_resource_set 9999 +} + +nexthop_group_replace_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.4 dev dummy1 + $IP nexthop add id 10 group 1/2 + + $IP nexthop replace id 10 group 1/2/3 + nexthop_check "id 10" "id 10 group 1/2/3 trap" + check_err $? "Unexpected nexthop group entry" + + nexthop_resource_check 6 + check_err $? "Wrong nexthop occupancy" + + log_test "Nexthop group replace" + + $IP nexthop flush &> /dev/null +} + +nexthop_group_replace_err_test() +{ + RET=0 + + nexthop_resource_set 5 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.4 dev dummy1 + $IP nexthop add id 10 group 1/2 + + $IP nexthop replace id 10 group 1/2/3 &> /dev/null + check_fail $? "Nexthop group replacement succeeded when should fail" + + nexthop_check "id 10" "id 10 group 1/2 trap" + check_err $? "Unexpected nexthop group entry after failure" + + nexthop_resource_check 5 + check_err $? "Wrong nexthop occupancy after failure" + + log_test "Nexthop group replace failure" + + $IP nexthop flush &> /dev/null + nexthop_resource_set 9999 +} + +nexthop_single_replace_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + + $IP nexthop replace id 1 via 192.0.2.3 dev dummy1 + nexthop_check "id 1" "id 1 via 192.0.2.3 dev dummy1 scope link trap" + check_err $? "Unexpected nexthop entry" + + nexthop_resource_check 1 + check_err $? "Wrong nexthop occupancy" + + log_test "Single nexthop replace" + + $IP nexthop flush &> /dev/null +} + +nexthop_single_replace_err_test() +{ + RET=0 + + # This is supposed to cause the replace to fail because the new nexthop + # is programmed before deleting the replaced one. + nexthop_resource_set 1 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + + $IP nexthop replace id 1 via 192.0.2.3 dev dummy1 &> /dev/null + check_fail $? "Nexthop replace succeeded when should fail" + + nexthop_check "id 1" "id 1 via 192.0.2.2 dev dummy1 scope link trap" + check_err $? "Unexpected nexthop entry after failure" + + nexthop_resource_check 1 + check_err $? "Wrong nexthop occupancy after failure" + + log_test "Single nexthop replace failure" + + $IP nexthop flush &> /dev/null + nexthop_resource_set 9999 +} + +nexthop_single_in_group_replace_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 10 group 1/2 + + $IP nexthop replace id 1 via 192.0.2.4 dev dummy1 + check_err $? "Failed to replace nexthop when should not" + + nexthop_check "id 10" "id 10 group 1/2 trap" + check_err $? "Unexpected nexthop group entry" + + nexthop_resource_check 4 + check_err $? "Wrong nexthop occupancy" + + log_test "Single nexthop replace while in group" + + $IP nexthop flush &> /dev/null +} + +nexthop_single_in_group_replace_err_test() +{ + RET=0 + + nexthop_resource_set 5 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 10 group 1/2 + + $IP nexthop replace id 1 via 192.0.2.4 dev dummy1 &> /dev/null + check_fail $? "Nexthop replacement succeeded when should fail" + + nexthop_check "id 1" "id 1 via 192.0.2.2 dev dummy1 scope link trap" + check_err $? "Unexpected nexthop entry after failure" + + nexthop_check "id 10" "id 10 group 1/2 trap" + check_err $? "Unexpected nexthop group entry after failure" + + nexthop_resource_check 4 + check_err $? "Wrong nexthop occupancy" + + log_test "Single nexthop replace while in group failure" + + $IP nexthop flush &> /dev/null + nexthop_resource_set 9999 +} + +nexthop_single_in_group_delete_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 10 group 1/2 + + $IP nexthop del id 1 + nexthop_check "id 10" "id 10 group 2 trap" + check_err $? "Unexpected nexthop group entry" + + nexthop_resource_check 2 + check_err $? "Wrong nexthop occupancy" + + log_test "Single nexthop delete while in group" + + $IP nexthop flush &> /dev/null +} + +nexthop_single_in_group_delete_err_test() +{ + RET=0 + + # First, nexthop 1 will be deleted, which will reduce the occupancy to + # 5. Afterwards, a replace notification will be sent for nexthop group + # 10 with only two nexthops. Since the new group is allocated before + # the old is deleted, the replacement will fail as it will result in an + # occupancy of 7. + nexthop_resource_set 6 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.4 dev dummy1 + $IP nexthop add id 10 group 1/2/3 + + $IP nexthop del id 1 + + nexthop_resource_check 5 + check_err $? "Wrong nexthop occupancy" + + log_test "Single nexthop delete while in group failure" + + $IP nexthop flush &> /dev/null + nexthop_resource_set 9999 +} + +nexthop_replay_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 10 group 1/2 + + $DEVLINK dev reload $DEVLINK_DEV + check_err $? "Failed to reload when should not" + + nexthop_check "id 1" "id 1 via 192.0.2.2 dev dummy1 scope link trap" + check_err $? "Unexpected nexthop entry after reload" + + nexthop_check "id 2" "id 2 via 192.0.2.3 dev dummy1 scope link trap" + check_err $? "Unexpected nexthop entry after reload" + + nexthop_check "id 10" "id 10 group 1/2 trap" + check_err $? "Unexpected nexthop group entry after reload" + + nexthop_resource_check 4 + check_err $? "Wrong nexthop occupancy" + + log_test "Nexthop replay" + + $IP nexthop flush &> /dev/null +} + +nexthop_replay_err_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 10 group 1/2 + + # Reduce size of nexthop resource so that reload will fail. + $DEVLINK resource set $DEVLINK_DEV path nexthops size 3 + $DEVLINK dev reload $DEVLINK_DEV &> /dev/null + check_fail $? "Reload succeeded when should fail" + + $DEVLINK resource set $DEVLINK_DEV path nexthops size 9999 + $DEVLINK dev reload $DEVLINK_DEV + check_err $? "Failed to reload when should not" + + log_test "Nexthop replay failure" + + $IP nexthop flush &> /dev/null +} + +setup_prepare() +{ + local netdev + + modprobe netdevsim &> /dev/null + + echo "$DEV_ADDR 1" > ${NETDEVSIM_PATH}/new_device + while [ ! -d $SYSFS_NET_DIR ] ; do :; done + + set -e + + ip netns add testns1 + devlink dev reload $DEVLINK_DEV netns testns1 + + IP="ip -netns testns1" + DEVLINK="devlink -N testns1" + + $IP link add name dummy1 up type dummy + $IP address add 192.0.2.1/24 dev dummy1 + + set +e +} + +cleanup() +{ + pre_cleanup + ip netns del testns1 + echo "$DEV_ADDR" > ${NETDEVSIM_PATH}/del_device + modprobe -r netdevsim &> /dev/null +} + +trap cleanup EXIT + +setup_prepare + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile index 32bdc978a711..41582fe485ee 100644 --- a/tools/testing/selftests/gpio/Makefile +++ b/tools/testing/selftests/gpio/Makefile @@ -11,22 +11,21 @@ LDLIBS += $(VAR_LDLIBS) TEST_PROGS := gpio-mockup.sh TEST_FILES := gpio-mockup-sysfs.sh -TEST_PROGS_EXTENDED := gpio-mockup-chardev +TEST_GEN_PROGS_EXTENDED := gpio-mockup-chardev -GPIODIR := $(realpath ../../../gpio) -GPIOOBJ := gpio-utils.o +KSFT_KHDR_INSTALL := 1 +include ../lib.mk -all: $(TEST_PROGS_EXTENDED) +GPIODIR := $(realpath ../../../gpio) +GPIOOUT := $(OUTPUT)/tools-gpio/ +GPIOOBJ := $(GPIOOUT)/gpio-utils.o -override define CLEAN - $(RM) $(TEST_PROGS_EXTENDED) - $(MAKE) -C $(GPIODIR) OUTPUT=$(GPIODIR)/ clean -endef +CLEAN += ; $(RM) -rf $(GPIOOUT) -KSFT_KHDR_INSTALL := 1 -include ../lib.mk +$(TEST_GEN_PROGS_EXTENDED): $(GPIOOBJ) -$(TEST_PROGS_EXTENDED): $(GPIODIR)/$(GPIOOBJ) +$(GPIOOUT): + mkdir -p $@ -$(GPIODIR)/$(GPIOOBJ): - $(MAKE) OUTPUT=$(GPIODIR)/ -C $(GPIODIR) +$(GPIOOBJ): $(GPIOOUT) + $(MAKE) OUTPUT=$(GPIOOUT) -C $(GPIODIR) diff --git a/tools/testing/selftests/intel_pstate/aperf.c b/tools/testing/selftests/intel_pstate/aperf.c index f6cd03a87493..a8acf3996973 100644 --- a/tools/testing/selftests/intel_pstate/aperf.c +++ b/tools/testing/selftests/intel_pstate/aperf.c @@ -10,8 +10,12 @@ #include <sched.h> #include <errno.h> #include <string.h> +#include <time.h> #include "../kselftest.h" +#define MSEC_PER_SEC 1000L +#define NSEC_PER_MSEC 1000000L + void usage(char *name) { printf ("Usage: %s cpunum\n", name); } @@ -22,7 +26,7 @@ int main(int argc, char **argv) { long long tsc, old_tsc, new_tsc; long long aperf, old_aperf, new_aperf; long long mperf, old_mperf, new_mperf; - struct timeb before, after; + struct timespec before, after; long long int start, finish, total; cpu_set_t cpuset; @@ -55,7 +59,10 @@ int main(int argc, char **argv) { return 1; } - ftime(&before); + if (clock_gettime(CLOCK_MONOTONIC, &before) < 0) { + perror("clock_gettime"); + return 1; + } pread(fd, &old_tsc, sizeof(old_tsc), 0x10); pread(fd, &old_aperf, sizeof(old_mperf), 0xe7); pread(fd, &old_mperf, sizeof(old_aperf), 0xe8); @@ -64,7 +71,10 @@ int main(int argc, char **argv) { sqrt(i); } - ftime(&after); + if (clock_gettime(CLOCK_MONOTONIC, &after) < 0) { + perror("clock_gettime"); + return 1; + } pread(fd, &new_tsc, sizeof(new_tsc), 0x10); pread(fd, &new_aperf, sizeof(new_mperf), 0xe7); pread(fd, &new_mperf, sizeof(new_aperf), 0xe8); @@ -73,11 +83,11 @@ int main(int argc, char **argv) { aperf = new_aperf-old_aperf; mperf = new_mperf-old_mperf; - start = before.time*1000 + before.millitm; - finish = after.time*1000 + after.millitm; + start = before.tv_sec*MSEC_PER_SEC + before.tv_nsec/NSEC_PER_MSEC; + finish = after.tv_sec*MSEC_PER_SEC + after.tv_nsec/NSEC_PER_MSEC; total = finish - start; - printf("runTime: %4.2f\n", 1.0*total/1000); + printf("runTime: %4.2f\n", 1.0*total/MSEC_PER_SEC); printf("freq: %7.0f\n", tsc / (1.0*aperf / (1.0 * mperf)) / total); return 0; } diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index b3ece55a2da6..6f441dd9f33c 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -156,14 +156,23 @@ static void guest_code_move_memory_region(void) GUEST_SYNC(0); /* - * Spin until the memory region is moved to a misaligned address. This - * may or may not trigger MMIO, as the window where the memslot is - * invalid is quite small. + * Spin until the memory region starts getting moved to a + * misaligned address. + * Every region move may or may not trigger MMIO, as the + * window where the memslot is invalid is usually quite small. */ val = guest_spin_on_val(0); GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val); - /* Spin until the memory region is realigned. */ + /* Spin until the misaligning memory region move completes. */ + val = guest_spin_on_val(MMIO_VAL); + GUEST_ASSERT_1(val == 1 || val == 0, val); + + /* Spin until the memory region starts to get re-aligned. */ + val = guest_spin_on_val(0); + GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val); + + /* Spin until the re-aligning memory region move completes. */ val = guest_spin_on_val(MMIO_VAL); GUEST_ASSERT_1(val == 1, val); diff --git a/tools/testing/selftests/lkdtm/tests.txt b/tools/testing/selftests/lkdtm/tests.txt index 74a8d329a72c..92ba4cc41314 100644 --- a/tools/testing/selftests/lkdtm/tests.txt +++ b/tools/testing/selftests/lkdtm/tests.txt @@ -68,3 +68,4 @@ USERCOPY_STACK_BEYOND USERCOPY_KERNEL STACKLEAK_ERASING OK: the rest of the thread stack is properly erased CFI_FORWARD_PROTO +FORTIFIED_STRSCPY diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index b018e835737d..be675002f918 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -20,7 +20,7 @@ #include <inttypes.h> #include <limits.h> #include <linux/falloc.h> -#include <linux/fcntl.h> +#include <fcntl.h> #include <linux/memfd.h> #include <sched.h> #include <stdio.h> diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 334a7eea2004..74baab83fec3 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -6,7 +6,7 @@ #include <inttypes.h> #include <limits.h> #include <linux/falloc.h> -#include <linux/fcntl.h> +#include <fcntl.h> #include <linux/memfd.h> #include <sched.h> #include <stdio.h> diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index ef352477cac6..fa5fa425d148 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -21,6 +21,7 @@ TEST_PROGS += rxtimestamp.sh TEST_PROGS += devlink_port_split.py TEST_PROGS += drop_monitor_tests.sh TEST_PROGS += vrf_route_leaking.sh +TEST_PROGS += bareudp.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any diff --git a/tools/testing/selftests/net/bareudp.sh b/tools/testing/selftests/net/bareudp.sh new file mode 100755 index 000000000000..f366cadbc5e8 --- /dev/null +++ b/tools/testing/selftests/net/bareudp.sh @@ -0,0 +1,546 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# Test various bareudp tunnel configurations. +# +# The bareudp module allows to tunnel network protocols like IP or MPLS over +# UDP, without adding any intermediate header. This scripts tests several +# configurations of bareudp (using IPv4 or IPv6 as underlay and transporting +# IPv4, IPv6 or MPLS packets on the overlay). +# +# Network topology: +# +# * A chain of 4 network namespaces, connected with veth pairs. Each veth +# is assigned an IPv4 and an IPv6 address. A host-route allows a veth to +# join its peer. +# +# * NS0 and NS3 are at the extremities of the chain. They have additional +# IPv4 and IPv6 addresses on their loopback device. Routes are added in NS0 +# and NS3, so that they can communicate using these overlay IP addresses. +# For IPv4 and IPv6 reachability tests, the route simply sets the peer's +# veth address as gateway. For MPLS reachability tests, an MPLS header is +# also pushed before the IP header. +# +# * NS1 and NS2 are the intermediate namespaces. They use a bareudp device to +# encapsulate the traffic into UDP. +# +# +-----------------------------------------------------------------------+ +# | NS0 | +# | | +# | lo: | +# | * IPv4 address: 192.0.2.100/32 | +# | * IPv6 address: 2001:db8::100/128 | +# | * IPv6 address: 2001:db8::200/128 | +# | * IPv4 route: 192.0.2.103/32 reachable via 192.0.2.11 | +# | * IPv6 route: 2001:db8::103/128 reachable via 2001:db8::11 | +# | * IPv6 route: 2001:db8::203/128 reachable via 2001:db8::11 | +# | (encapsulated with MPLS label 203) | +# | | +# | veth01: | +# | ^ * IPv4 address: 192.0.2.10, peer 192.0.2.11/32 | +# | | * IPv6 address: 2001:db8::10, peer 2001:db8::11/128 | +# | | | +# +---+-------------------------------------------------------------------+ +# | +# | Traffic type: IP or MPLS (depending on test) +# | +# +---+-------------------------------------------------------------------+ +# | | NS1 | +# | | | +# | v | +# | veth10: | +# | * IPv4 address: 192.0.2.11, peer 192.0.2.10/32 | +# | * IPv6 address: 2001:db8::11, peer 2001:db8::10/128 | +# | | +# | bareudp_ns1: | +# | * Encapsulate IP or MPLS packets received on veth10 into UDP | +# | and send the resulting packets through veth12. | +# | * Decapsulate bareudp packets (either IP or MPLS, over UDP) | +# | received on veth12 and send the inner packets through veth10. | +# | | +# | veth12: | +# | ^ * IPv4 address: 192.0.2.21, peer 192.0.2.22/32 | +# | | * IPv6 address: 2001:db8::21, peer 2001:db8::22/128 | +# | | | +# +---+-------------------------------------------------------------------+ +# | +# | Traffic type: IP or MPLS (depending on test), over UDP +# | +# +---+-------------------------------------------------------------------+ +# | | NS2 | +# | | | +# | v | +# | veth21: | +# | * IPv4 address: 192.0.2.22, peer 192.0.2.21/32 | +# | * IPv6 address: 2001:db8::22, peer 2001:db8::21/128 | +# | | +# | bareudp_ns2: | +# | * Decapsulate bareudp packets (either IP or MPLS, over UDP) | +# | received on veth21 and send the inner packets through veth23. | +# | * Encapsulate IP or MPLS packets received on veth23 into UDP | +# | and send the resulting packets through veth21. | +# | | +# | veth23: | +# | ^ * IPv4 address: 192.0.2.32, peer 192.0.2.33/32 | +# | | * IPv6 address: 2001:db8::32, peer 2001:db8::33/128 | +# | | | +# +---+-------------------------------------------------------------------+ +# | +# | Traffic type: IP or MPLS (depending on test) +# | +# +---+-------------------------------------------------------------------+ +# | | NS3 | +# | v | +# | veth32: | +# | * IPv4 address: 192.0.2.33, peer 192.0.2.32/32 | +# | * IPv6 address: 2001:db8::33, peer 2001:db8::32/128 | +# | | +# | lo: | +# | * IPv4 address: 192.0.2.103/32 | +# | * IPv6 address: 2001:db8::103/128 | +# | * IPv6 address: 2001:db8::203/128 | +# | * IPv4 route: 192.0.2.100/32 reachable via 192.0.2.32 | +# | * IPv6 route: 2001:db8::100/128 reachable via 2001:db8::32 | +# | * IPv6 route: 2001:db8::200/128 reachable via 2001:db8::32 | +# | (encapsulated with MPLS label 200) | +# | | +# +-----------------------------------------------------------------------+ + +ERR=4 # Return 4 by default, which is the SKIP code for kselftest +PING6="ping" +PAUSE_ON_FAIL="no" + +readonly NS0=$(mktemp -u ns0-XXXXXXXX) +readonly NS1=$(mktemp -u ns1-XXXXXXXX) +readonly NS2=$(mktemp -u ns2-XXXXXXXX) +readonly NS3=$(mktemp -u ns3-XXXXXXXX) + +# Exit the script after having removed the network namespaces it created +# +# Parameters: +# +# * The list of network namespaces to delete before exiting. +# +exit_cleanup() +{ + for ns in "$@"; do + ip netns delete "${ns}" 2>/dev/null || true + done + + if [ "${ERR}" -eq 4 ]; then + echo "Error: Setting up the testing environment failed." >&2 + fi + + exit "${ERR}" +} + +# Create the four network namespaces used by the script (NS0, NS1, NS2 and NS3) +# +# New namespaces are cleaned up manually in case of error, to ensure that only +# namespaces created by this script are deleted. +create_namespaces() +{ + ip netns add "${NS0}" || exit_cleanup + ip netns add "${NS1}" || exit_cleanup "${NS0}" + ip netns add "${NS2}" || exit_cleanup "${NS0}" "${NS1}" + ip netns add "${NS3}" || exit_cleanup "${NS0}" "${NS1}" "${NS2}" +} + +# The trap function handler +# +exit_cleanup_all() +{ + exit_cleanup "${NS0}" "${NS1}" "${NS2}" "${NS3}" +} + +# Configure a network interface using a host route +# +# Parameters +# +# * $1: the netns the network interface resides in, +# * $2: the network interface name, +# * $3: the local IPv4 address to assign to this interface, +# * $4: the IPv4 address of the remote network interface, +# * $5: the local IPv6 address to assign to this interface, +# * $6: the IPv6 address of the remote network interface. +# +iface_config() +{ + local NS="${1}"; readonly NS + local DEV="${2}"; readonly DEV + local LOCAL_IP4="${3}"; readonly LOCAL_IP4 + local PEER_IP4="${4}"; readonly PEER_IP4 + local LOCAL_IP6="${5}"; readonly LOCAL_IP6 + local PEER_IP6="${6}"; readonly PEER_IP6 + + ip -netns "${NS}" link set dev "${DEV}" up + ip -netns "${NS}" address add dev "${DEV}" "${LOCAL_IP4}" peer "${PEER_IP4}" + ip -netns "${NS}" address add dev "${DEV}" "${LOCAL_IP6}" peer "${PEER_IP6}" nodad +} + +# Create base networking topology: +# +# * set up the loopback device in all network namespaces (NS0..NS3), +# * set up a veth pair to connect each netns in sequence (NS0 with NS1, +# NS1 with NS2, etc.), +# * add and IPv4 and an IPv6 address on each veth interface, +# * prepare the ingress qdiscs in the intermediate namespaces. +# +setup_underlay() +{ + for ns in "${NS0}" "${NS1}" "${NS2}" "${NS3}"; do + ip -netns "${ns}" link set dev lo up + done; + + ip link add name veth01 netns "${NS0}" type veth peer name veth10 netns "${NS1}" + ip link add name veth12 netns "${NS1}" type veth peer name veth21 netns "${NS2}" + ip link add name veth23 netns "${NS2}" type veth peer name veth32 netns "${NS3}" + iface_config "${NS0}" veth01 192.0.2.10 192.0.2.11/32 2001:db8::10 2001:db8::11/128 + iface_config "${NS1}" veth10 192.0.2.11 192.0.2.10/32 2001:db8::11 2001:db8::10/128 + iface_config "${NS1}" veth12 192.0.2.21 192.0.2.22/32 2001:db8::21 2001:db8::22/128 + iface_config "${NS2}" veth21 192.0.2.22 192.0.2.21/32 2001:db8::22 2001:db8::21/128 + iface_config "${NS2}" veth23 192.0.2.32 192.0.2.33/32 2001:db8::32 2001:db8::33/128 + iface_config "${NS3}" veth32 192.0.2.33 192.0.2.32/32 2001:db8::33 2001:db8::32/128 + + tc -netns "${NS1}" qdisc add dev veth10 ingress + tc -netns "${NS2}" qdisc add dev veth23 ingress +} + +# Set up the IPv4, IPv6 and MPLS overlays. +# +# Configuration is similar for all protocols: +# +# * add an overlay IP address on the loopback interface of each edge +# namespace, +# * route these IP addresses via the intermediate namespaces (for the MPLS +# tests, this is also where MPLS encapsulation is done), +# * add routes for these IP addresses (or MPLS labels) in the intermediate +# namespaces. +# +# The bareudp encapsulation isn't configured in setup_overlay_*(). That will be +# done just before running the reachability tests. + +setup_overlay_ipv4() +{ + # Add the overlay IP addresses and route them through the veth devices + ip -netns "${NS0}" address add 192.0.2.100/32 dev lo + ip -netns "${NS3}" address add 192.0.2.103/32 dev lo + ip -netns "${NS0}" route add 192.0.2.103/32 src 192.0.2.100 via 192.0.2.11 + ip -netns "${NS3}" route add 192.0.2.100/32 src 192.0.2.103 via 192.0.2.32 + + # Route the overlay addresses in the intermediate namespaces + # (used after bareudp decapsulation) + ip netns exec "${NS1}" sysctl -qw net.ipv4.ip_forward=1 + ip netns exec "${NS2}" sysctl -qw net.ipv4.ip_forward=1 + ip -netns "${NS1}" route add 192.0.2.100/32 via 192.0.2.10 + ip -netns "${NS2}" route add 192.0.2.103/32 via 192.0.2.33 + + # The intermediate namespaces don't have routes for the reverse path, + # as it will be handled by tc. So we need to ensure that rp_filter is + # not going to block the traffic. + ip netns exec "${NS1}" sysctl -qw net.ipv4.conf.all.rp_filter=0 + ip netns exec "${NS2}" sysctl -qw net.ipv4.conf.all.rp_filter=0 + ip netns exec "${NS1}" sysctl -qw net.ipv4.conf.default.rp_filter=0 + ip netns exec "${NS2}" sysctl -qw net.ipv4.conf.default.rp_filter=0 +} + +setup_overlay_ipv6() +{ + # Add the overlay IP addresses and route them through the veth devices + ip -netns "${NS0}" address add 2001:db8::100/128 dev lo + ip -netns "${NS3}" address add 2001:db8::103/128 dev lo + ip -netns "${NS0}" route add 2001:db8::103/128 src 2001:db8::100 via 2001:db8::11 + ip -netns "${NS3}" route add 2001:db8::100/128 src 2001:db8::103 via 2001:db8::32 + + # Route the overlay addresses in the intermediate namespaces + # (used after bareudp decapsulation) + ip netns exec "${NS1}" sysctl -qw net.ipv6.conf.all.forwarding=1 + ip netns exec "${NS2}" sysctl -qw net.ipv6.conf.all.forwarding=1 + ip -netns "${NS1}" route add 2001:db8::100/128 via 2001:db8::10 + ip -netns "${NS2}" route add 2001:db8::103/128 via 2001:db8::33 +} + +setup_overlay_mpls() +{ + # Add specific overlay IP addresses, routed over MPLS + ip -netns "${NS0}" address add 2001:db8::200/128 dev lo + ip -netns "${NS3}" address add 2001:db8::203/128 dev lo + ip -netns "${NS0}" route add 2001:db8::203/128 src 2001:db8::200 encap mpls 203 via 2001:db8::11 + ip -netns "${NS3}" route add 2001:db8::200/128 src 2001:db8::203 encap mpls 200 via 2001:db8::32 + + # Route the MPLS packets in the intermediate namespaces + # (used after bareudp decapsulation) + ip netns exec "${NS1}" sysctl -qw net.mpls.platform_labels=256 + ip netns exec "${NS2}" sysctl -qw net.mpls.platform_labels=256 + ip -netns "${NS1}" -family mpls route add 200 via inet6 2001:db8::10 + ip -netns "${NS2}" -family mpls route add 203 via inet6 2001:db8::33 +} + +# Run "ping" from NS0 and print the result +# +# Parameters: +# +# * $1: the variant of ping to use (normally either "ping" or "ping6"), +# * $2: the IP address to ping, +# * $3: a human readable description of the purpose of the test. +# +# If the test fails and PAUSE_ON_FAIL is active, the user is given the +# possibility to continue with the next test or to quit immediately. +# +ping_test_one() +{ + local PING="$1"; readonly PING + local IP="$2"; readonly IP + local MSG="$3"; readonly MSG + local RET + + printf "TEST: %-60s " "${MSG}" + + set +e + ip netns exec "${NS0}" "${PING}" -w 5 -c 1 "${IP}" > /dev/null 2>&1 + RET=$? + set -e + + if [ "${RET}" -eq 0 ]; then + printf "[ OK ]\n" + else + ERR=1 + printf "[FAIL]\n" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + printf "\nHit enter to continue, 'q' to quit\n" + read a + if [ "$a" = "q" ]; then + exit 1 + fi + fi + fi +} + +# Run reachability tests +# +# Parameters: +# +# * $1: human readable string describing the underlay protocol. +# +# $IPV4, $IPV6, $MPLS_UC and $MULTIPROTO are inherited from the calling +# function. +# +ping_test() +{ + local UNDERLAY="$1"; readonly UNDERLAY + local MODE + local MSG + + if [ "${MULTIPROTO}" = "multiproto" ]; then + MODE=" (multiproto mode)" + else + MODE="" + fi + + if [ $IPV4 ]; then + ping_test_one "ping" "192.0.2.103" "IPv4 packets over ${UNDERLAY}${MODE}" + fi + if [ $IPV6 ]; then + ping_test_one "${PING6}" "2001:db8::103" "IPv6 packets over ${UNDERLAY}${MODE}" + fi + if [ $MPLS_UC ]; then + ping_test_one "${PING6}" "2001:db8::203" "Unicast MPLS packets over ${UNDERLAY}${MODE}" + fi +} + +# Set up a bareudp overlay and run reachability tests over IPv4 and IPv6 +# +# Parameters: +# +# * $1: the packet type (protocol) to be handled by bareudp, +# * $2: a flag to activate or deactivate bareudp's "multiproto" mode. +# +test_overlay() +{ + local ETHERTYPE="$1"; readonly ETHERTYPE + local MULTIPROTO="$2"; readonly MULTIPROTO + local IPV4 + local IPV6 + local MPLS_UC + + case "${ETHERTYPE}" in + "ipv4") + IPV4="ipv4" + if [ "${MULTIPROTO}" = "multiproto" ]; then + IPV6="ipv6" + else + IPV6="" + fi + MPLS_UC="" + ;; + "ipv6") + IPV6="ipv6" + IPV4="" + MPLS_UC="" + ;; + "mpls_uc") + MPLS_UC="mpls_uc" + IPV4="" + IPV6="" + ;; + *) + exit 1 + ;; + esac + readonly IPV4 + readonly IPV6 + readonly MPLS_UC + + # Create the bareudp devices in the intermediate namespaces + ip -netns "${NS1}" link add name bareudp_ns1 up type bareudp dstport 6635 ethertype "${ETHERTYPE}" "${MULTIPROTO}" + ip -netns "${NS2}" link add name bareudp_ns2 up type bareudp dstport 6635 ethertype "${ETHERTYPE}" "${MULTIPROTO}" + + # IPv4 over UDPv4 + if [ $IPV4 ]; then + # Encapsulation instructions for bareudp over IPv4 + tc -netns "${NS1}" filter add dev veth10 ingress protocol ipv4 \ + flower dst_ip 192.0.2.103/32 \ + action tunnel_key set src_ip 192.0.2.21 dst_ip 192.0.2.22 id 0 \ + action mirred egress redirect dev bareudp_ns1 + tc -netns "${NS2}" filter add dev veth23 ingress protocol ipv4 \ + flower dst_ip 192.0.2.100/32 \ + action tunnel_key set src_ip 192.0.2.22 dst_ip 192.0.2.21 id 0 \ + action mirred egress redirect dev bareudp_ns2 + fi + + # IPv6 over UDPv4 + if [ $IPV6 ]; then + # Encapsulation instructions for bareudp over IPv4 + tc -netns "${NS1}" filter add dev veth10 ingress protocol ipv6 \ + flower dst_ip 2001:db8::103/128 \ + action tunnel_key set src_ip 192.0.2.21 dst_ip 192.0.2.22 id 0 \ + action mirred egress redirect dev bareudp_ns1 + tc -netns "${NS2}" filter add dev veth23 ingress protocol ipv6 \ + flower dst_ip 2001:db8::100/128 \ + action tunnel_key set src_ip 192.0.2.22 dst_ip 192.0.2.21 id 0 \ + action mirred egress redirect dev bareudp_ns2 + fi + + # MPLS (unicast) over UDPv4 + if [ $MPLS_UC ]; then + ip netns exec "${NS1}" sysctl -qw net.mpls.conf.bareudp_ns1.input=1 + ip netns exec "${NS2}" sysctl -qw net.mpls.conf.bareudp_ns2.input=1 + + # Encapsulation instructions for bareudp over IPv4 + tc -netns "${NS1}" filter add dev veth10 ingress protocol mpls_uc \ + flower mpls_label 203 \ + action tunnel_key set src_ip 192.0.2.21 dst_ip 192.0.2.22 id 0 \ + action mirred egress redirect dev bareudp_ns1 + tc -netns "${NS2}" filter add dev veth23 ingress protocol mpls_uc \ + flower mpls_label 200 \ + action tunnel_key set src_ip 192.0.2.22 dst_ip 192.0.2.21 id 0 \ + action mirred egress redirect dev bareudp_ns2 + fi + + # Test IPv4 underlay + ping_test "UDPv4" + + # Cleanup bareudp encapsulation instructions, as they were specific to + # the IPv4 underlay, before setting up and testing the IPv6 underlay + tc -netns "${NS1}" filter delete dev veth10 ingress + tc -netns "${NS2}" filter delete dev veth23 ingress + + # IPv4 over UDPv6 + if [ $IPV4 ]; then + # New encapsulation instructions for bareudp over IPv6 + tc -netns "${NS1}" filter add dev veth10 ingress protocol ipv4 \ + flower dst_ip 192.0.2.103/32 \ + action tunnel_key set src_ip 2001:db8::21 dst_ip 2001:db8::22 id 0 \ + action mirred egress redirect dev bareudp_ns1 + tc -netns "${NS2}" filter add dev veth23 ingress protocol ipv4 \ + flower dst_ip 192.0.2.100/32 \ + action tunnel_key set src_ip 2001:db8::22 dst_ip 2001:db8::21 id 0 \ + action mirred egress redirect dev bareudp_ns2 + fi + + # IPv6 over UDPv6 + if [ $IPV6 ]; then + # New encapsulation instructions for bareudp over IPv6 + tc -netns "${NS1}" filter add dev veth10 ingress protocol ipv6 \ + flower dst_ip 2001:db8::103/128 \ + action tunnel_key set src_ip 2001:db8::21 dst_ip 2001:db8::22 id 0 \ + action mirred egress redirect dev bareudp_ns1 + tc -netns "${NS2}" filter add dev veth23 ingress protocol ipv6 \ + flower dst_ip 2001:db8::100/128 \ + action tunnel_key set src_ip 2001:db8::22 dst_ip 2001:db8::21 id 0 \ + action mirred egress redirect dev bareudp_ns2 + fi + + # MPLS (unicast) over UDPv6 + if [ $MPLS_UC ]; then + # New encapsulation instructions for bareudp over IPv6 + tc -netns "${NS1}" filter add dev veth10 ingress protocol mpls_uc \ + flower mpls_label 203 \ + action tunnel_key set src_ip 2001:db8::21 dst_ip 2001:db8::22 id 0 \ + action mirred egress redirect dev bareudp_ns1 + tc -netns "${NS2}" filter add dev veth23 ingress protocol mpls_uc \ + flower mpls_label 200 \ + action tunnel_key set src_ip 2001:db8::22 dst_ip 2001:db8::21 id 0 \ + action mirred egress redirect dev bareudp_ns2 + fi + + # Test IPv6 underlay + ping_test "UDPv6" + + tc -netns "${NS1}" filter delete dev veth10 ingress + tc -netns "${NS2}" filter delete dev veth23 ingress + ip -netns "${NS1}" link delete bareudp_ns1 + ip -netns "${NS2}" link delete bareudp_ns2 +} + +check_features() +{ + ip link help 2>&1 | grep -q bareudp + if [ $? -ne 0 ]; then + echo "Missing bareudp support in iproute2" >&2 + exit_cleanup + fi + + # Use ping6 on systems where ping doesn't handle IPv6 + ping -w 1 -c 1 ::1 > /dev/null 2>&1 || PING6="ping6" +} + +usage() +{ + echo "Usage: $0 [-p]" + exit 1 +} + +while getopts :p o +do + case $o in + p) PAUSE_ON_FAIL="yes";; + *) usage;; + esac +done + +check_features + +# Create namespaces before setting up the exit trap. +# Otherwise, exit_cleanup_all() could delete namespaces that were not created +# by this script. +create_namespaces + +set -e +trap exit_cleanup_all EXIT + +setup_underlay +setup_overlay_ipv4 +setup_overlay_ipv6 +setup_overlay_mpls + +test_overlay ipv4 nomultiproto +test_overlay ipv6 nomultiproto +test_overlay ipv4 multiproto +test_overlay mpls_uc nomultiproto + +if [ "${ERR}" -eq 1 ]; then + echo "Some tests failed." >&2 +else + ERR=0 +fi diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 4d5df8e1eee7..614d5477365a 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -34,3 +34,10 @@ CONFIG_TRACEPOINTS=y CONFIG_NET_DROP_MONITOR=m CONFIG_NETDEVSIM=m CONFIG_NET_FOU=m +CONFIG_MPLS_ROUTING=m +CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_MIRRED=m +CONFIG_BAREUDP=m diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index fb5c55dd6df8..02b0b9ead40b 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -256,6 +256,28 @@ setup_cmd_nsb() fi } +setup_cmd_nsc() +{ + local cmd="$*" + local rc + + run_cmd_nsc ${cmd} + rc=$? + if [ $rc -ne 0 ]; then + # show user the command if not done so already + if [ "$VERBOSE" = "0" ]; then + echo "setup command: $cmd" + fi + echo "failed. stopping tests" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue" + read a + fi + exit $rc + fi +} + # set sysctl values in NS-A set_sysctl() { @@ -471,6 +493,36 @@ setup() sleep 1 } +setup_lla_only() +{ + # make sure we are starting with a clean slate + kill_procs + cleanup 2>/dev/null + + log_debug "Configuring network namespaces" + set -e + + create_ns ${NSA} "-" "-" + create_ns ${NSB} "-" "-" + create_ns ${NSC} "-" "-" + connect_ns ${NSA} ${NSA_DEV} "-" "-" \ + ${NSB} ${NSB_DEV} "-" "-" + connect_ns ${NSA} ${NSA_DEV2} "-" "-" \ + ${NSC} ${NSC_DEV} "-" "-" + + NSA_LINKIP6=$(get_linklocal ${NSA} ${NSA_DEV}) + NSB_LINKIP6=$(get_linklocal ${NSB} ${NSB_DEV}) + NSC_LINKIP6=$(get_linklocal ${NSC} ${NSC_DEV}) + + create_vrf ${NSA} ${VRF} ${VRF_TABLE} "-" "-" + ip -netns ${NSA} link set dev ${NSA_DEV} vrf ${VRF} + ip -netns ${NSA} link set dev ${NSA_DEV2} vrf ${VRF} + + set +e + + sleep 1 +} + ################################################################################ # IPv4 @@ -3787,10 +3839,53 @@ use_case_br() setup_cmd_nsb ip li del vlan100 2>/dev/null } +# VRF only. +# ns-A device is connected to both ns-B and ns-C on a single VRF but only has +# LLA on the interfaces +use_case_ping_lla_multi() +{ + setup_lla_only + # only want reply from ns-A + setup_cmd_nsb sysctl -qw net.ipv6.icmp.echo_ignore_multicast=1 + setup_cmd_nsc sysctl -qw net.ipv6.icmp.echo_ignore_multicast=1 + + log_start + run_cmd_nsb ping -c1 -w1 ${MCAST}%${NSB_DEV} + log_test_addr ${MCAST}%${NSB_DEV} $? 0 "Pre cycle, ping out ns-B" + + run_cmd_nsc ping -c1 -w1 ${MCAST}%${NSC_DEV} + log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Pre cycle, ping out ns-C" + + # cycle/flap the first ns-A interface + setup_cmd ip link set ${NSA_DEV} down + setup_cmd ip link set ${NSA_DEV} up + sleep 1 + + log_start + run_cmd_nsb ping -c1 -w1 ${MCAST}%${NSB_DEV} + log_test_addr ${MCAST}%${NSB_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV}, ping out ns-B" + run_cmd_nsc ping -c1 -w1 ${MCAST}%${NSC_DEV} + log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV}, ping out ns-C" + + # cycle/flap the second ns-A interface + setup_cmd ip link set ${NSA_DEV2} down + setup_cmd ip link set ${NSA_DEV2} up + sleep 1 + + log_start + run_cmd_nsb ping -c1 -w1 ${MCAST}%${NSB_DEV} + log_test_addr ${MCAST}%${NSB_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV2}, ping out ns-B" + run_cmd_nsc ping -c1 -w1 ${MCAST}%${NSC_DEV} + log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV2}, ping out ns-C" +} + use_cases() { log_section "Use cases" + log_subsection "Device enslaved to bridge" use_case_br + log_subsection "Ping LLA with multiple interfaces" + use_case_ping_lla_multi } ################################################################################ diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 250fbb2d1625..d97bd6889446 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -48,6 +48,7 @@ TEST_PROGS = bridge_igmp.sh \ tc_chains.sh \ tc_flower_router.sh \ tc_flower.sh \ + tc_mpls_l2vpn.sh \ tc_shblocks.sh \ tc_vlan_modify.sh \ vxlan_asymmetric.sh \ diff --git a/tools/testing/selftests/net/forwarding/bridge_igmp.sh b/tools/testing/selftests/net/forwarding/bridge_igmp.sh index 88d2472ba151..675eff45b037 100755 --- a/tools/testing/selftests/net/forwarding/bridge_igmp.sh +++ b/tools/testing/selftests/net/forwarding/bridge_igmp.sh @@ -1,11 +1,37 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="reportleave_test" +ALL_TESTS="v2reportleave_test v3include_test v3inc_allow_test v3inc_is_include_test \ + v3inc_is_exclude_test v3inc_to_exclude_test v3exc_allow_test v3exc_is_include_test \ + v3exc_is_exclude_test v3exc_to_exclude_test v3inc_block_test v3exc_block_test \ + v3exc_timeout_test v3star_ex_auto_add_test" NUM_NETIFS=4 CHECK_TC="yes" TEST_GROUP="239.10.10.10" TEST_GROUP_MAC="01:00:5e:0a:0a:0a" + +ALL_GROUP="224.0.0.1" +ALL_MAC="01:00:5e:00:00:01" + +# IGMPv3 is_in report: grp 239.10.10.10 is_include 192.0.2.1,192.0.2.2,192.0.2.3 +MZPKT_IS_INC="22:00:9d:de:00:00:00:01:01:00:00:03:ef:0a:0a:0a:c0:00:02:01:c0:00:02:02:c0:00:02:03" +# IGMPv3 is_in report: grp 239.10.10.10 is_include 192.0.2.10,192.0.2.11,192.0.2.12 +MZPKT_IS_INC2="22:00:9d:c3:00:00:00:01:01:00:00:03:ef:0a:0a:0a:c0:00:02:0a:c0:00:02:0b:c0:00:02:0c" +# IGMPv3 is_in report: grp 239.10.10.10 is_include 192.0.2.20,192.0.2.30 +MZPKT_IS_INC3="22:00:5f:b4:00:00:00:01:01:00:00:02:ef:0a:0a:0a:c0:00:02:14:c0:00:02:1e" +# IGMPv3 allow report: grp 239.10.10.10 allow 192.0.2.10,192.0.2.11,192.0.2.12 +MZPKT_ALLOW="22:00:99:c3:00:00:00:01:05:00:00:03:ef:0a:0a:0a:c0:00:02:0a:c0:00:02:0b:c0:00:02:0c" +# IGMPv3 allow report: grp 239.10.10.10 allow 192.0.2.20,192.0.2.30 +MZPKT_ALLOW2="22:00:5b:b4:00:00:00:01:05:00:00:02:ef:0a:0a:0a:c0:00:02:14:c0:00:02:1e" +# IGMPv3 is_ex report: grp 239.10.10.10 is_exclude 192.0.2.1,192.0.2.2,192.0.2.20,192.0.2.21 +MZPKT_IS_EXC="22:00:da:b6:00:00:00:01:02:00:00:04:ef:0a:0a:0a:c0:00:02:01:c0:00:02:02:c0:00:02:14:c0:00:02:15" +# IGMPv3 is_ex report: grp 239.10.10.10 is_exclude 192.0.2.20,192.0.2.30 +MZPKT_IS_EXC2="22:00:5e:b4:00:00:00:01:02:00:00:02:ef:0a:0a:0a:c0:00:02:14:c0:00:02:1e" +# IGMPv3 to_ex report: grp 239.10.10.10 to_exclude 192.0.2.1,192.0.2.20,192.0.2.30 +MZPKT_TO_EXC="22:00:9a:b1:00:00:00:01:04:00:00:03:ef:0a:0a:0a:c0:00:02:01:c0:00:02:14:c0:00:02:1e" +# IGMPv3 block report: grp 239.10.10.10 block 192.0.2.1,192.0.2.20,192.0.2.30 +MZPKT_BLOCK="22:00:98:b1:00:00:00:01:06:00:00:03:ef:0a:0a:0a:c0:00:02:01:c0:00:02:14:c0:00:02:1e" + source lib.sh h1_create() @@ -79,38 +105,7 @@ cleanup() vrf_cleanup } -# return 0 if the packet wasn't seen on host2_if or 1 if it was -mcast_packet_test() -{ - local mac=$1 - local ip=$2 - local host1_if=$3 - local host2_if=$4 - local seen=0 - - # Add an ACL on `host2_if` which will tell us whether the packet - # was received by it or not. - tc qdisc add dev $host2_if ingress - tc filter add dev $host2_if ingress protocol ip pref 1 handle 101 \ - flower dst_mac $mac action drop - - $MZ $host1_if -c 1 -p 64 -b $mac -B $ip -t udp "dp=4096,sp=2048" -q - sleep 1 - - tc -j -s filter show dev $host2_if ingress \ - | jq -e ".[] | select(.options.handle == 101) \ - | select(.options.actions[0].stats.packets == 1)" &> /dev/null - if [[ $? -eq 0 ]]; then - seen=1 - fi - - tc filter del dev $host2_if ingress protocol ip pref 1 handle 101 flower - tc qdisc del dev $host2_if ingress - - return $seen -} - -reportleave_test() +v2reportleave_test() { RET=0 ip address add dev $h2 $TEST_GROUP/32 autojoin @@ -118,12 +113,12 @@ reportleave_test() sleep 5 bridge mdb show dev br0 | grep $TEST_GROUP 1>/dev/null - check_err $? "Report didn't create mdb entry for $TEST_GROUP" + check_err $? "IGMPv2 report didn't create mdb entry for $TEST_GROUP" - mcast_packet_test $TEST_GROUP_MAC $TEST_GROUP $h1 $h2 + mcast_packet_test $TEST_GROUP_MAC 192.0.2.1 $TEST_GROUP $h1 $h2 check_fail $? "Traffic to $TEST_GROUP wasn't forwarded" - log_test "IGMP report $TEST_GROUP" + log_test "IGMPv2 report $TEST_GROUP" RET=0 bridge mdb show dev br0 | grep $TEST_GROUP 1>/dev/null @@ -136,10 +131,424 @@ reportleave_test() bridge mdb show dev br0 | grep $TEST_GROUP 1>/dev/null check_fail $? "Leave didn't delete mdb entry for $TEST_GROUP" - mcast_packet_test $TEST_GROUP_MAC $TEST_GROUP $h1 $h2 + mcast_packet_test $TEST_GROUP_MAC 192.0.2.1 $TEST_GROUP $h1 $h2 check_err $? "Traffic to $TEST_GROUP was forwarded without mdb entry" - log_test "IGMP leave $TEST_GROUP" + log_test "IGMPv2 leave $TEST_GROUP" +} + +v3include_prepare() +{ + local host1_if=$1 + local mac=$2 + local group=$3 + local X=("192.0.2.1" "192.0.2.2" "192.0.2.3") + + ip link set dev br0 type bridge mcast_igmp_version 3 + check_err $? "Could not change bridge IGMP version to 3" + + $MZ $host1_if -b $mac -c 1 -B $group -t ip "proto=2,p=$MZPKT_IS_INC" -q + sleep 1 + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and .source_list != null)" &>/dev/null + check_err $? "Missing *,G entry with source list" + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and .filter_mode == \"include\")" &>/dev/null + check_err $? "Wrong *,G entry filter mode" + brmcast_check_sg_entries "is_include" "${X[@]}" +} + +v3exclude_prepare() +{ + local host1_if=$1 + local mac=$2 + local group=$3 + local pkt=$4 + local X=("192.0.2.1" "192.0.2.2") + local Y=("192.0.2.20" "192.0.2.21") + + v3include_prepare $host1_if $mac $group + + $MZ $host1_if -c 1 -b $mac -B $group -t ip "proto=2,p=$MZPKT_IS_EXC" -q + sleep 1 + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and .filter_mode == \"exclude\")" &>/dev/null + check_err $? "Wrong *,G entry filter mode" + + brmcast_check_sg_entries "is_exclude" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and + .source_list[].address == \"192.0.2.3\")" &>/dev/null + check_fail $? "Wrong *,G entry source list, 192.0.2.3 entry still exists" +} + +v3cleanup() +{ + local port=$1 + local group=$2 + + bridge mdb del dev br0 port $port grp $group + ip link set dev br0 type bridge mcast_igmp_version 2 +} + +v3include_test() +{ + RET=0 + local X=("192.0.2.1" "192.0.2.2" "192.0.2.3") + + v3include_prepare $h1 $ALL_MAC $ALL_GROUP + + brmcast_check_sg_state 0 "${X[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" + brmcast_check_sg_fwding 0 "192.0.2.100" + + log_test "IGMPv3 report $TEST_GROUP is_include" + + v3cleanup $swp1 $TEST_GROUP +} + +v3inc_allow_test() +{ + RET=0 + local X=("192.0.2.10" "192.0.2.11" "192.0.2.12") + + v3include_prepare $h1 $ALL_MAC $ALL_GROUP + + $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_ALLOW" -q + sleep 1 + brmcast_check_sg_entries "allow" "${X[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" + brmcast_check_sg_fwding 0 "192.0.2.100" + + log_test "IGMPv3 report $TEST_GROUP include -> allow" + + v3cleanup $swp1 $TEST_GROUP +} + +v3inc_is_include_test() +{ + RET=0 + local X=("192.0.2.10" "192.0.2.11" "192.0.2.12") + + v3include_prepare $h1 $ALL_MAC $ALL_GROUP + + $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_IS_INC2" -q + sleep 1 + brmcast_check_sg_entries "is_include" "${X[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" + brmcast_check_sg_fwding 0 "192.0.2.100" + + log_test "IGMPv3 report $TEST_GROUP include -> is_include" + + v3cleanup $swp1 $TEST_GROUP +} + +v3inc_is_exclude_test() +{ + RET=0 + + v3exclude_prepare $h1 $ALL_MAC $ALL_GROUP + + brmcast_check_sg_fwding 1 "${X[@]}" 192.0.2.100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "IGMPv3 report $TEST_GROUP include -> is_exclude" + + v3cleanup $swp1 $TEST_GROUP +} + +v3inc_to_exclude_test() +{ + RET=0 + local X=("192.0.2.1") + local Y=("192.0.2.20" "192.0.2.30") + + v3include_prepare $h1 $ALL_MAC $ALL_GROUP + + ip link set dev br0 type bridge mcast_last_member_interval 500 + check_err $? "Could not change mcast_last_member_interval to 5s" + + $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_TO_EXC" -q + sleep 1 + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and .filter_mode == \"exclude\")" &>/dev/null + check_err $? "Wrong *,G entry filter mode" + + brmcast_check_sg_entries "to_exclude" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and + .source_list[].address == \"192.0.2.2\")" &>/dev/null + check_fail $? "Wrong *,G entry source list, 192.0.2.2 entry still exists" + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and + .source_list[].address == \"192.0.2.21\")" &>/dev/null + check_fail $? "Wrong *,G entry source list, 192.0.2.21 entry still exists" + + brmcast_check_sg_fwding 1 "${X[@]}" 192.0.2.100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "IGMPv3 report $TEST_GROUP include -> to_exclude" + + ip link set dev br0 type bridge mcast_last_member_interval 100 + + v3cleanup $swp1 $TEST_GROUP +} + +v3exc_allow_test() +{ + RET=0 + local X=("192.0.2.1" "192.0.2.2" "192.0.2.20" "192.0.2.30") + local Y=("192.0.2.21") + + v3exclude_prepare $h1 $ALL_MAC $ALL_GROUP + + $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_ALLOW2" -q + sleep 1 + brmcast_check_sg_entries "allow" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" 192.0.2.100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "IGMPv3 report $TEST_GROUP exclude -> allow" + + v3cleanup $swp1 $TEST_GROUP +} + +v3exc_is_include_test() +{ + RET=0 + local X=("192.0.2.1" "192.0.2.2" "192.0.2.20" "192.0.2.30") + local Y=("192.0.2.21") + + v3exclude_prepare $h1 $ALL_MAC $ALL_GROUP + + $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_IS_INC3" -q + sleep 1 + brmcast_check_sg_entries "is_include" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" 192.0.2.100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "IGMPv3 report $TEST_GROUP exclude -> is_include" + + v3cleanup $swp1 $TEST_GROUP +} + +v3exc_is_exclude_test() +{ + RET=0 + local X=("192.0.2.30") + local Y=("192.0.2.20") + + v3exclude_prepare $h1 $ALL_MAC $ALL_GROUP + + $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_IS_EXC2" -q + sleep 1 + brmcast_check_sg_entries "is_exclude" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" 192.0.2.100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "IGMPv3 report $TEST_GROUP exclude -> is_exclude" + + v3cleanup $swp1 $TEST_GROUP +} + +v3exc_to_exclude_test() +{ + RET=0 + local X=("192.0.2.1" "192.0.2.30") + local Y=("192.0.2.20") + + v3exclude_prepare $h1 $ALL_MAC $ALL_GROUP + + ip link set dev br0 type bridge mcast_last_member_interval 500 + check_err $? "Could not change mcast_last_member_interval to 5s" + + $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_TO_EXC" -q + sleep 1 + brmcast_check_sg_entries "to_exclude" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" 192.0.2.100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "IGMPv3 report $TEST_GROUP exclude -> to_exclude" + + ip link set dev br0 type bridge mcast_last_member_interval 100 + + v3cleanup $swp1 $TEST_GROUP +} + +v3inc_block_test() +{ + RET=0 + local X=("192.0.2.2" "192.0.2.3") + + v3include_prepare $h1 $ALL_MAC $ALL_GROUP + + $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_BLOCK" -q + # make sure the lowered timers have expired (by default 2 seconds) + sleep 3 + brmcast_check_sg_entries "block" "${X[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and + .source_list[].address == \"192.0.2.1\")" &>/dev/null + check_fail $? "Wrong *,G entry source list, 192.0.2.1 entry still exists" + + brmcast_check_sg_fwding 1 "${X[@]}" + brmcast_check_sg_fwding 0 "192.0.2.100" + + log_test "IGMPv3 report $TEST_GROUP include -> block" + + v3cleanup $swp1 $TEST_GROUP +} + +v3exc_block_test() +{ + RET=0 + local X=("192.0.2.1" "192.0.2.2" "192.0.2.30") + local Y=("192.0.2.20" "192.0.2.21") + + v3exclude_prepare $h1 $ALL_MAC $ALL_GROUP + + ip link set dev br0 type bridge mcast_last_member_interval 500 + check_err $? "Could not change mcast_last_member_interval to 5s" + + $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_BLOCK" -q + sleep 1 + brmcast_check_sg_entries "block" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" 192.0.2.100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "IGMPv3 report $TEST_GROUP exclude -> block" + + ip link set dev br0 type bridge mcast_last_member_interval 100 + + v3cleanup $swp1 $TEST_GROUP +} + +v3exc_timeout_test() +{ + RET=0 + local X=("192.0.2.20" "192.0.2.30") + + # GMI should be 3 seconds + ip link set dev br0 type bridge mcast_query_interval 100 mcast_query_response_interval 100 + + v3exclude_prepare $h1 $ALL_MAC $ALL_GROUP + ip link set dev br0 type bridge mcast_query_interval 500 mcast_query_response_interval 500 + $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_ALLOW2" -q + sleep 3 + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and .filter_mode == \"include\")" &>/dev/null + check_err $? "Wrong *,G entry filter mode" + + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and + .source_list[].address == \"192.0.2.1\")" &>/dev/null + check_fail $? "Wrong *,G entry source list, 192.0.2.1 entry still exists" + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and + .source_list[].address == \"192.0.2.2\")" &>/dev/null + check_fail $? "Wrong *,G entry source list, 192.0.2.2 entry still exists" + + brmcast_check_sg_entries "allow" "${X[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" + brmcast_check_sg_fwding 0 192.0.2.100 + + log_test "IGMPv3 group $TEST_GROUP exclude timeout" + + ip link set dev br0 type bridge mcast_query_interval 12500 \ + mcast_query_response_interval 1000 + + v3cleanup $swp1 $TEST_GROUP +} + +v3star_ex_auto_add_test() +{ + RET=0 + + v3exclude_prepare $h1 $ALL_MAC $ALL_GROUP + + $MZ $h2 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_IS_INC" -q + sleep 1 + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and .src == \"192.0.2.3\" and \ + .port == \"$swp1\")" &>/dev/null + check_err $? "S,G entry for *,G port doesn't exist" + + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and .src == \"192.0.2.3\" and \ + .port == \"$swp1\" and \ + .flags[] == \"added_by_star_ex\")" &>/dev/null + check_err $? "Auto-added S,G entry doesn't have added_by_star_ex flag" + + brmcast_check_sg_fwding 1 192.0.2.3 + + log_test "IGMPv3 S,G port entry automatic add to a *,G port" + + v3cleanup $swp1 $TEST_GROUP + v3cleanup $swp2 $TEST_GROUP } trap cleanup EXIT diff --git a/tools/testing/selftests/net/forwarding/bridge_mld.sh b/tools/testing/selftests/net/forwarding/bridge_mld.sh new file mode 100755 index 000000000000..ffdcfa87ca2b --- /dev/null +++ b/tools/testing/selftests/net/forwarding/bridge_mld.sh @@ -0,0 +1,558 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS="mldv2include_test mldv2inc_allow_test mldv2inc_is_include_test mldv2inc_is_exclude_test \ + mldv2inc_to_exclude_test mldv2exc_allow_test mldv2exc_is_include_test \ + mldv2exc_is_exclude_test mldv2exc_to_exclude_test mldv2inc_block_test \ + mldv2exc_block_test mldv2exc_timeout_test mldv2star_ex_auto_add_test" +NUM_NETIFS=4 +CHECK_TC="yes" +TEST_GROUP="ff02::cc" +TEST_GROUP_MAC="33:33:00:00:00:cc" + +# MLDv2 is_in report: grp ff02::cc is_include 2001:db8:1::1,2001:db8:1::2,2001:db8:1::3 +MZPKT_IS_INC="33:33:00:00:00:01:fe:54:00:04:5e:ba:86:dd:60:0a:2d:ae:00:54:00:01:fe:80:00:\ +00:00:00:00:00:fc:54:00:ff:fe:04:5e:ba:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:01:3a:\ +00:05:02:00:00:00:00:8f:00:8e:d9:00:00:00:01:01:00:00:03:ff:02:00:00:00:00:00:00:00:00:00:\ +00:00:00:00:cc:20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:01:20:01:0d:b8:00:01:00:00:00:\ +00:00:00:00:00:00:02:20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:03" +# MLDv2 is_in report: grp ff02::cc is_include 2001:db8:1::10,2001:db8:1::11,2001:db8:1::12 +MZPKT_IS_INC2="33:33:00:00:00:01:fe:54:00:04:5e:ba:86:dd:60:0a:2d:ae:00:54:00:01:fe:80:00:\ +00:00:00:00:00:fc:54:00:ff:fe:04:5e:ba:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:01:3a:00:\ +05:02:00:00:00:00:8f:00:8e:ac:00:00:00:01:01:00:00:03:ff:02:00:00:00:00:00:00:00:00:00:00:00:\ +00:00:cc:20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:10:20:01:0d:b8:00:01:00:00:00:00:00:00:\ +00:00:00:11:20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:12" +# MLDv2 is_in report: grp ff02::cc is_include 2001:db8:1::20,2001:db8:1::30 +MZPKT_IS_INC3="33:33:00:00:00:01:fe:54:00:04:5e:ba:86:dd:60:0a:2d:ae:00:44:00:01:fe:80:00:00:00:\ +00:00:00:fc:54:00:ff:fe:04:5e:ba:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:01:3a:00:05:02:00:\ +00:00:00:8f:00:bc:5a:00:00:00:01:01:00:00:02:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:cc:20:\ +01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:20:20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:30" +# MLDv2 allow report: grp ff02::cc allow 2001:db8:1::10,2001:db8:1::11,2001:db8:1::12 +MZPKT_ALLOW="33:33:00:00:00:01:fe:54:00:04:5e:ba:86:dd:60:0a:2d:ae:00:54:00:01:fe:80:00:00:\ +00:00:00:00:fc:54:00:ff:fe:04:5e:ba:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:01:3a:00:05:\ +02:00:00:00:00:8f:00:8a:ac:00:00:00:01:05:00:00:03:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:\ +00:cc:20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:10:20:01:0d:b8:00:01:00:00:00:00:00:00:00:\ +00:00:11:20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:12" +# MLDv2 allow report: grp ff02::cc allow 2001:db8:1::20,2001:db8:1::30 +MZPKT_ALLOW2="33:33:00:00:00:01:fe:54:00:04:5e:ba:86:dd:60:0a:2d:ae:00:44:00:01:fe:80:00:00:00:\ +00:00:00:fc:54:00:ff:fe:04:5e:ba:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:01:3a:00:05:02:00:\ +00:00:00:8f:00:b8:5a:00:00:00:01:05:00:00:02:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:cc:20:\ +01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:20:20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:30" +# MLDv2 is_ex report: grp ff02::cc is_exclude 2001:db8:1::1,2001:db8:1::2,2001:db8:1::20,2001:db8:1::21 +MZPKT_IS_EXC="33:33:00:00:00:01:fe:54:00:04:5e:ba:86:dd:60:0a:2d:ae:00:64:00:01:fe:80:00:00:00:\ +00:00:00:fc:54:00:ff:fe:04:5e:ba:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:01:3a:00:05:02:00:\ +00:00:00:8f:00:5f:d0:00:00:00:01:02:00:00:04:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:cc:20:\ +01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:01:20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:02:20:\ +01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:20:20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:21" +# MLDv2 is_ex report: grp ff02::cc is_exclude 2001:db8:1::20,2001:db8:1::30 +MZPKT_IS_EXC2="33:33:00:00:00:01:fe:54:00:04:5e:ba:86:dd:60:0a:2d:ae:00:44:00:01:fe:80:00:00:00:\ +00:00:00:fc:54:00:ff:fe:04:5e:ba:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:01:3a:00:05:02:00:\ +00:00:00:8f:00:bb:5a:00:00:00:01:02:00:00:02:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:cc:20:\ +01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:20:20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:30" +# MLDv2 to_ex report: grp ff02::cc to_exclude 2001:db8:1::1,2001:db8:1::20,2001:db8:1::30 +MZPKT_TO_EXC="33:33:00:00:00:01:fe:54:00:04:5e:ba:86:dd:60:0a:2d:ae:00:54:00:01:fe:80:00:00:00:\ +00:00:00:fc:54:00:ff:fe:04:5e:ba:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:01:3a:00:05:02:00:\ +00:00:00:8f:00:8b:8e:00:00:00:01:04:00:00:03:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:cc:20:\ +01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:01:20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:20:20:\ +01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:30" +# MLDv2 block report: grp ff02::cc block 2001:db8:1::1,2001:db8:1::20,2001:db8:1::30 +MZPKT_BLOCK="33:33:00:00:00:01:fe:54:00:04:5e:ba:86:dd:60:0a:2d:ae:00:54:00:01:fe:80:00:00:00:00:\ +00:00:fc:54:00:ff:fe:04:5e:ba:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:01:3a:00:05:02:00:00:\ +00:00:8f:00:89:8e:00:00:00:01:06:00:00:03:ff:02:00:00:00:00:00:00:00:00:00:00:00:00:00:cc:20:01:\ +0d:b8:00:01:00:00:00:00:00:00:00:00:00:01:20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:20:20:01:\ +0d:b8:00:01:00:00:00:00:00:00:00:00:00:30" + +source lib.sh + +h1_create() +{ + simple_if_init $h1 2001:db8:1::1/64 +} + +h1_destroy() +{ + simple_if_fini $h1 2001:db8:1::1/64 +} + +h2_create() +{ + simple_if_init $h2 2001:db8:1::2/64 +} + +h2_destroy() +{ + simple_if_fini $h2 2001:db8:1::2/64 +} + +switch_create() +{ + ip link add dev br0 type bridge mcast_snooping 1 mcast_query_response_interval 100 \ + mcast_mld_version 2 mcast_startup_query_interval 300 \ + mcast_querier 1 + + ip link set dev $swp1 master br0 + ip link set dev $swp2 master br0 + + ip link set dev br0 up + ip link set dev $swp1 up + ip link set dev $swp2 up + + # make sure a query has been generated + sleep 5 +} + +switch_destroy() +{ + ip link set dev $swp2 down + ip link set dev $swp1 down + + ip link del dev br0 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + vrf_prepare + + h1_create + h2_create + + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + + h2_destroy + h1_destroy + + vrf_cleanup +} + +mldv2include_prepare() +{ + local host1_if=$1 + local X=("2001:db8:1::1" "2001:db8:1::2" "2001:db8:1::3") + + ip link set dev br0 type bridge mcast_mld_version 2 + check_err $? "Could not change bridge MLD version to 2" + + $MZ $host1_if $MZPKT_IS_INC -q + sleep 1 + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and .source_list != null)" &>/dev/null + check_err $? "Missing *,G entry with source list" + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and .filter_mode == \"include\")" &>/dev/null + check_err $? "Wrong *,G entry filter mode" + brmcast_check_sg_entries "is_include" "${X[@]}" +} + +mldv2exclude_prepare() +{ + local host1_if=$1 + local mac=$2 + local group=$3 + local pkt=$4 + local X=("2001:db8:1::1" "2001:db8:1::2") + local Y=("2001:db8:1::20" "2001:db8:1::21") + + mldv2include_prepare $h1 + + $MZ $host1_if -c 1 $MZPKT_IS_EXC -q + sleep 1 + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and .filter_mode == \"exclude\")" &>/dev/null + check_err $? "Wrong *,G entry filter mode" + + brmcast_check_sg_entries "is_exclude" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and + .source_list[].address == \"2001:db8:1::3\")" &>/dev/null + check_fail $? "Wrong *,G entry source list, 2001:db8:1::3 entry still exists" +} + +mldv2cleanup() +{ + local port=$1 + + bridge mdb del dev br0 port $port grp $TEST_GROUP + ip link set dev br0 type bridge mcast_mld_version 1 +} + +mldv2include_test() +{ + RET=0 + local X=("2001:db8:1::1" "2001:db8:1::2" "2001:db8:1::3") + + mldv2include_prepare $h1 + + brmcast_check_sg_state 0 "${X[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" + brmcast_check_sg_fwding 0 "2001:db8:1::100" + + log_test "MLDv2 report $TEST_GROUP is_include" + + mldv2cleanup $swp1 +} + +mldv2inc_allow_test() +{ + RET=0 + local X=("2001:db8:1::10" "2001:db8:1::11" "2001:db8:1::12") + + mldv2include_prepare $h1 + + $MZ $h1 -c 1 $MZPKT_ALLOW -q + sleep 1 + brmcast_check_sg_entries "allow" "${X[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" + brmcast_check_sg_fwding 0 "2001:db8:1::100" + + log_test "MLDv2 report $TEST_GROUP include -> allow" + + mldv2cleanup $swp1 +} + +mldv2inc_is_include_test() +{ + RET=0 + local X=("2001:db8:1::10" "2001:db8:1::11" "2001:db8:1::12") + + mldv2include_prepare $h1 + + $MZ $h1 -c 1 $MZPKT_IS_INC2 -q + sleep 1 + brmcast_check_sg_entries "is_include" "${X[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" + brmcast_check_sg_fwding 0 "2001:db8:1::100" + + log_test "MLDv2 report $TEST_GROUP include -> is_include" + + mldv2cleanup $swp1 +} + +mldv2inc_is_exclude_test() +{ + RET=0 + + mldv2exclude_prepare $h1 + + brmcast_check_sg_fwding 1 "${X[@]}" 2001:db8:1::100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "MLDv2 report $TEST_GROUP include -> is_exclude" + + mldv2cleanup $swp1 +} + +mldv2inc_to_exclude_test() +{ + RET=0 + local X=("2001:db8:1::1") + local Y=("2001:db8:1::20" "2001:db8:1::30") + + mldv2include_prepare $h1 + + ip link set dev br0 type bridge mcast_last_member_interval 500 + check_err $? "Could not change mcast_last_member_interval to 5s" + + $MZ $h1 -c 1 $MZPKT_TO_EXC -q + sleep 1 + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and .filter_mode == \"exclude\")" &>/dev/null + check_err $? "Wrong *,G entry filter mode" + + brmcast_check_sg_entries "to_exclude" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and + .source_list[].address == \"2001:db8:1::2\")" &>/dev/null + check_fail $? "Wrong *,G entry source list, 2001:db8:1::2 entry still exists" + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and + .source_list[].address == \"2001:db8:1::21\")" &>/dev/null + check_fail $? "Wrong *,G entry source list, 2001:db8:1::21 entry still exists" + + brmcast_check_sg_fwding 1 "${X[@]}" 2001:db8:1::100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "MLDv2 report $TEST_GROUP include -> to_exclude" + + ip link set dev br0 type bridge mcast_last_member_interval 100 + + mldv2cleanup $swp1 +} + +mldv2exc_allow_test() +{ + RET=0 + local X=("2001:db8:1::1" "2001:db8:1::2" "2001:db8:1::20" "2001:db8:1::30") + local Y=("2001:db8:1::21") + + mldv2exclude_prepare $h1 + + $MZ $h1 -c 1 $MZPKT_ALLOW2 -q + sleep 1 + brmcast_check_sg_entries "allow" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" 2001:db8:1::100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "MLDv2 report $TEST_GROUP exclude -> allow" + + mldv2cleanup $swp1 +} + +mldv2exc_is_include_test() +{ + RET=0 + local X=("2001:db8:1::1" "2001:db8:1::2" "2001:db8:1::20" "2001:db8:1::30") + local Y=("2001:db8:1::21") + + mldv2exclude_prepare $h1 + + $MZ $h1 -c 1 $MZPKT_IS_INC3 -q + sleep 1 + brmcast_check_sg_entries "is_include" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" 2001:db8:1::100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "MLDv2 report $TEST_GROUP exclude -> is_include" + + mldv2cleanup $swp1 +} + +mldv2exc_is_exclude_test() +{ + RET=0 + local X=("2001:db8:1::30") + local Y=("2001:db8:1::20") + + mldv2exclude_prepare $h1 + + $MZ $h1 -c 1 $MZPKT_IS_EXC2 -q + sleep 1 + brmcast_check_sg_entries "is_exclude" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" 2001:db8:1::100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "MLDv2 report $TEST_GROUP exclude -> is_exclude" + + mldv2cleanup $swp1 +} + +mldv2exc_to_exclude_test() +{ + RET=0 + local X=("2001:db8:1::1" "2001:db8:1::30") + local Y=("2001:db8:1::20") + + mldv2exclude_prepare $h1 + + ip link set dev br0 type bridge mcast_last_member_interval 500 + check_err $? "Could not change mcast_last_member_interval to 5s" + + $MZ $h1 -c 1 $MZPKT_TO_EXC -q + sleep 1 + brmcast_check_sg_entries "to_exclude" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" 2001:db8:1::100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "MLDv2 report $TEST_GROUP exclude -> to_exclude" + + ip link set dev br0 type bridge mcast_last_member_interval 100 + + mldv2cleanup $swp1 +} + +mldv2inc_block_test() +{ + RET=0 + local X=("2001:db8:1::2" "2001:db8:1::3") + + mldv2include_prepare $h1 + + $MZ $h1 -c 1 $MZPKT_BLOCK -q + # make sure the lowered timers have expired (by default 2 seconds) + sleep 3 + brmcast_check_sg_entries "block" "${X[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and + .source_list[].address == \"2001:db8:1::1\")" &>/dev/null + check_fail $? "Wrong *,G entry source list, 2001:db8:1::1 entry still exists" + + brmcast_check_sg_fwding 1 "${X[@]}" + brmcast_check_sg_fwding 0 2001:db8:1::100 + + log_test "MLDv2 report $TEST_GROUP include -> block" + + mldv2cleanup $swp1 +} + +mldv2exc_block_test() +{ + RET=0 + local X=("2001:db8:1::1" "2001:db8:1::2" "2001:db8:1::30") + local Y=("2001:db8:1::20" "2001:db8:1::21") + + mldv2exclude_prepare $h1 + + ip link set dev br0 type bridge mcast_last_member_interval 500 + check_err $? "Could not change mcast_last_member_interval to 5s" + + $MZ $h1 -c 1 $MZPKT_BLOCK -q + sleep 1 + brmcast_check_sg_entries "block" "${X[@]}" "${Y[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + brmcast_check_sg_state 1 "${Y[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" 2001:db8:1::100 + brmcast_check_sg_fwding 0 "${Y[@]}" + + log_test "MLDv2 report $TEST_GROUP exclude -> block" + + ip link set dev br0 type bridge mcast_last_member_interval 100 + + mldv2cleanup $swp1 +} + +mldv2exc_timeout_test() +{ + RET=0 + local X=("2001:db8:1::20" "2001:db8:1::30") + + # GMI should be 3 seconds + ip link set dev br0 type bridge mcast_query_interval 100 mcast_query_response_interval 100 + + mldv2exclude_prepare $h1 + ip link set dev br0 type bridge mcast_query_interval 500 mcast_query_response_interval 500 + $MZ $h1 -c 1 $MZPKT_ALLOW2 -q + sleep 3 + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and .filter_mode == \"include\")" &>/dev/null + check_err $? "Wrong *,G entry filter mode" + + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and + .source_list[].address == \"2001:db8:1::1\")" &>/dev/null + check_fail $? "Wrong *,G entry source list, 2001:db8:1::1 entry still exists" + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and \ + .source_list != null and + .source_list[].address == \"2001:db8:1::2\")" &>/dev/null + check_fail $? "Wrong *,G entry source list, 2001:db8:1::2 entry still exists" + + brmcast_check_sg_entries "allow" "${X[@]}" + + brmcast_check_sg_state 0 "${X[@]}" + + brmcast_check_sg_fwding 1 "${X[@]}" + brmcast_check_sg_fwding 0 2001:db8:1::100 + + log_test "MLDv2 group $TEST_GROUP exclude timeout" + + ip link set dev br0 type bridge mcast_query_interval 12500 \ + mcast_query_response_interval 1000 + + mldv2cleanup $swp1 +} + +mldv2star_ex_auto_add_test() +{ + RET=0 + + mldv2exclude_prepare $h1 + + $MZ $h2 -c 1 $MZPKT_IS_INC -q + sleep 1 + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and .src == \"2001:db8:1::3\" and \ + .port == \"$swp1\")" &>/dev/null + check_err $? "S,G entry for *,G port doesn't exist" + + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and .src == \"2001:db8:1::3\" and \ + .port == \"$swp1\" and \ + .flags[] == \"added_by_star_ex\")" &>/dev/null + check_err $? "Auto-added S,G entry doesn't have added_by_star_ex flag" + + brmcast_check_sg_fwding 1 2001:db8:1::3 + + log_test "MLDv2 S,G port entry automatic add to a *,G port" + + mldv2cleanup $swp1 + mldv2cleanup $swp2 +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/config b/tools/testing/selftests/net/forwarding/config index da96eff72a8e..10e9a3321ae1 100644 --- a/tools/testing/selftests/net/forwarding/config +++ b/tools/testing/selftests/net/forwarding/config @@ -6,6 +6,9 @@ CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_NET_VRF=m CONFIG_BPF_SYSCALL=y CONFIG_CGROUP_BPF=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_MPLS=m +CONFIG_NET_ACT_VLAN=m CONFIG_NET_CLS_FLOWER=m CONFIG_NET_SCH_INGRESS=m CONFIG_NET_ACT_GACT=m diff --git a/tools/testing/selftests/net/forwarding/gre_multipath_nh.sh b/tools/testing/selftests/net/forwarding/gre_multipath_nh.sh new file mode 100755 index 000000000000..d03aa2cab9fd --- /dev/null +++ b/tools/testing/selftests/net/forwarding/gre_multipath_nh.sh @@ -0,0 +1,356 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test traffic distribution when a wECMP route forwards traffic to two GRE +# tunnels. +# +# +-------------------------+ +# | H1 | +# | $h1 + | +# | 192.0.2.1/28 | | +# | 2001:db8:1::1/64 | | +# +-------------------|-----+ +# | +# +-------------------|------------------------+ +# | SW1 | | +# | $ol1 + | +# | 192.0.2.2/28 | +# | 2001:db8:1::2/64 | +# | | +# | + g1a (gre) + g1b (gre) | +# | loc=192.0.2.65 loc=192.0.2.81 | +# | rem=192.0.2.66 --. rem=192.0.2.82 --. | +# | tos=inherit | tos=inherit | | +# | .------------------' | | +# | | .------------------' | +# | v v | +# | + $ul1.111 (vlan) + $ul1.222 (vlan) | +# | | 192.0.2.129/28 | 192.0.2.145/28 | +# | \ / | +# | \________________/ | +# | | | +# | + $ul1 | +# +------------|-------------------------------+ +# | +# +------------|-------------------------------+ +# | SW2 + $ul2 | +# | _______|________ | +# | / \ | +# | / \ | +# | + $ul2.111 (vlan) + $ul2.222 (vlan) | +# | ^ 192.0.2.130/28 ^ 192.0.2.146/28 | +# | | | | +# | | '------------------. | +# | '------------------. | | +# | + g2a (gre) | + g2b (gre) | | +# | loc=192.0.2.66 | loc=192.0.2.82 | | +# | rem=192.0.2.65 --' rem=192.0.2.81 --' | +# | tos=inherit tos=inherit | +# | | +# | $ol2 + | +# | 192.0.2.17/28 | | +# | 2001:db8:2::1/64 | | +# +-------------------|------------------------+ +# | +# +-------------------|-----+ +# | H2 | | +# | $h2 + | +# | 192.0.2.18/28 | +# | 2001:db8:2::2/64 | +# +-------------------------+ + +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + multipath_ipv4 + multipath_ipv6 + multipath_ipv6_l4 +" + +NUM_NETIFS=6 +source lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64 + ip route add vrf v$h1 192.0.2.16/28 via 192.0.2.2 + ip route add vrf v$h1 2001:db8:2::/64 via 2001:db8:1::2 +} + +h1_destroy() +{ + ip route del vrf v$h1 2001:db8:2::/64 via 2001:db8:1::2 + ip route del vrf v$h1 192.0.2.16/28 via 192.0.2.2 + simple_if_fini $h1 192.0.2.1/28 +} + +sw1_create() +{ + simple_if_init $ol1 192.0.2.2/28 2001:db8:1::2/64 + __simple_if_init $ul1 v$ol1 + vlan_create $ul1 111 v$ol1 192.0.2.129/28 + vlan_create $ul1 222 v$ol1 192.0.2.145/28 + + tunnel_create g1a gre 192.0.2.65 192.0.2.66 tos inherit dev v$ol1 + __simple_if_init g1a v$ol1 192.0.2.65/32 + ip route add vrf v$ol1 192.0.2.66/32 via 192.0.2.130 + + tunnel_create g1b gre 192.0.2.81 192.0.2.82 tos inherit dev v$ol1 + __simple_if_init g1b v$ol1 192.0.2.81/32 + ip route add vrf v$ol1 192.0.2.82/32 via 192.0.2.146 + + ip -6 nexthop add id 101 dev g1a + ip -6 nexthop add id 102 dev g1b + ip nexthop add id 103 group 101/102 + + ip route add vrf v$ol1 192.0.2.16/28 nhid 103 + ip route add vrf v$ol1 2001:db8:2::/64 nhid 103 +} + +sw1_destroy() +{ + ip route del vrf v$ol1 2001:db8:2::/64 + ip route del vrf v$ol1 192.0.2.16/28 + + ip nexthop del id 103 + ip -6 nexthop del id 102 + ip -6 nexthop del id 101 + + ip route del vrf v$ol1 192.0.2.82/32 via 192.0.2.146 + __simple_if_fini g1b 192.0.2.81/32 + tunnel_destroy g1b + + ip route del vrf v$ol1 192.0.2.66/32 via 192.0.2.130 + __simple_if_fini g1a 192.0.2.65/32 + tunnel_destroy g1a + + vlan_destroy $ul1 222 + vlan_destroy $ul1 111 + __simple_if_fini $ul1 + simple_if_fini $ol1 192.0.2.2/28 2001:db8:1::2/64 +} + +sw2_create() +{ + simple_if_init $ol2 192.0.2.17/28 2001:db8:2::1/64 + __simple_if_init $ul2 v$ol2 + vlan_create $ul2 111 v$ol2 192.0.2.130/28 + vlan_create $ul2 222 v$ol2 192.0.2.146/28 + + tunnel_create g2a gre 192.0.2.66 192.0.2.65 tos inherit dev v$ol2 + __simple_if_init g2a v$ol2 192.0.2.66/32 + ip route add vrf v$ol2 192.0.2.65/32 via 192.0.2.129 + + tunnel_create g2b gre 192.0.2.82 192.0.2.81 tos inherit dev v$ol2 + __simple_if_init g2b v$ol2 192.0.2.82/32 + ip route add vrf v$ol2 192.0.2.81/32 via 192.0.2.145 + + ip -6 nexthop add id 201 dev g2a + ip -6 nexthop add id 202 dev g2b + ip nexthop add id 203 group 201/202 + + ip route add vrf v$ol2 192.0.2.0/28 nhid 203 + ip route add vrf v$ol2 2001:db8:1::/64 nhid 203 + + tc qdisc add dev $ul2 clsact + tc filter add dev $ul2 ingress pref 111 prot 802.1Q \ + flower vlan_id 111 action pass + tc filter add dev $ul2 ingress pref 222 prot 802.1Q \ + flower vlan_id 222 action pass +} + +sw2_destroy() +{ + tc qdisc del dev $ul2 clsact + + ip route del vrf v$ol2 2001:db8:1::/64 + ip route del vrf v$ol2 192.0.2.0/28 + + ip nexthop del id 203 + ip -6 nexthop del id 202 + ip -6 nexthop del id 201 + + ip route del vrf v$ol2 192.0.2.81/32 via 192.0.2.145 + __simple_if_fini g2b 192.0.2.82/32 + tunnel_destroy g2b + + ip route del vrf v$ol2 192.0.2.65/32 via 192.0.2.129 + __simple_if_fini g2a 192.0.2.66/32 + tunnel_destroy g2a + + vlan_destroy $ul2 222 + vlan_destroy $ul2 111 + __simple_if_fini $ul2 + simple_if_fini $ol2 192.0.2.17/28 2001:db8:2::1/64 +} + +h2_create() +{ + simple_if_init $h2 192.0.2.18/28 2001:db8:2::2/64 + ip route add vrf v$h2 192.0.2.0/28 via 192.0.2.17 + ip route add vrf v$h2 2001:db8:1::/64 via 2001:db8:2::1 +} + +h2_destroy() +{ + ip route del vrf v$h2 2001:db8:1::/64 via 2001:db8:2::1 + ip route del vrf v$h2 192.0.2.0/28 via 192.0.2.17 + simple_if_fini $h2 192.0.2.18/28 2001:db8:2::2/64 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + ol1=${NETIFS[p2]} + + ul1=${NETIFS[p3]} + ul2=${NETIFS[p4]} + + ol2=${NETIFS[p5]} + h2=${NETIFS[p6]} + + vrf_prepare + h1_create + sw1_create + sw2_create + h2_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + h2_destroy + sw2_destroy + sw1_destroy + h1_destroy + vrf_cleanup +} + +multipath4_test() +{ + local what=$1; shift + local weight1=$1; shift + local weight2=$1; shift + + sysctl_set net.ipv4.fib_multipath_hash_policy 1 + ip nexthop replace id 103 group 101,$weight1/102,$weight2 + + local t0_111=$(tc_rule_stats_get $ul2 111 ingress) + local t0_222=$(tc_rule_stats_get $ul2 222 ingress) + + ip vrf exec v$h1 \ + $MZ $h1 -q -p 64 -A 192.0.2.1 -B 192.0.2.18 \ + -d 1msec -t udp "sp=1024,dp=0-32768" + + local t1_111=$(tc_rule_stats_get $ul2 111 ingress) + local t1_222=$(tc_rule_stats_get $ul2 222 ingress) + + local d111=$((t1_111 - t0_111)) + local d222=$((t1_222 - t0_222)) + multipath_eval "$what" $weight1 $weight2 $d111 $d222 + + ip nexthop replace id 103 group 101/102 + sysctl_restore net.ipv4.fib_multipath_hash_policy +} + +multipath6_test() +{ + local what=$1; shift + local weight1=$1; shift + local weight2=$1; shift + + sysctl_set net.ipv6.fib_multipath_hash_policy 0 + ip nexthop replace id 103 group 101,$weight1/102,$weight2 + + local t0_111=$(tc_rule_stats_get $ul2 111 ingress) + local t0_222=$(tc_rule_stats_get $ul2 222 ingress) + + # Generate 16384 echo requests, each with a random flow label. + for ((i=0; i < 16384; ++i)); do + ip vrf exec v$h1 $PING6 2001:db8:2::2 -F 0 -c 1 -q &> /dev/null + done + + local t1_111=$(tc_rule_stats_get $ul2 111 ingress) + local t1_222=$(tc_rule_stats_get $ul2 222 ingress) + + local d111=$((t1_111 - t0_111)) + local d222=$((t1_222 - t0_222)) + multipath_eval "$what" $weight1 $weight2 $d111 $d222 + + ip nexthop replace id 103 group 101/102 + sysctl_restore net.ipv6.fib_multipath_hash_policy +} + +multipath6_l4_test() +{ + local what=$1; shift + local weight1=$1; shift + local weight2=$1; shift + + sysctl_set net.ipv6.fib_multipath_hash_policy 1 + ip nexthop replace id 103 group 101,$weight1/102,$weight2 + + local t0_111=$(tc_rule_stats_get $ul2 111 ingress) + local t0_222=$(tc_rule_stats_get $ul2 222 ingress) + + ip vrf exec v$h1 \ + $MZ $h1 -6 -q -p 64 -A 2001:db8:1::1 -B 2001:db8:2::2 \ + -d 1msec -t udp "sp=1024,dp=0-32768" + + local t1_111=$(tc_rule_stats_get $ul2 111 ingress) + local t1_222=$(tc_rule_stats_get $ul2 222 ingress) + + local d111=$((t1_111 - t0_111)) + local d222=$((t1_222 - t0_222)) + multipath_eval "$what" $weight1 $weight2 $d111 $d222 + + ip nexthop replace id 103 group 101/102 + sysctl_restore net.ipv6.fib_multipath_hash_policy +} + +ping_ipv4() +{ + ping_test $h1 192.0.2.18 +} + +ping_ipv6() +{ + ping6_test $h1 2001:db8:2::2 +} + +multipath_ipv4() +{ + log_info "Running IPv4 multipath tests" + multipath4_test "ECMP" 1 1 + multipath4_test "Weighted MP 2:1" 2 1 + multipath4_test "Weighted MP 11:45" 11 45 +} + +multipath_ipv6() +{ + log_info "Running IPv6 multipath tests" + multipath6_test "ECMP" 1 1 + multipath6_test "Weighted MP 2:1" 2 1 + multipath6_test "Weighted MP 11:45" 11 45 +} + +multipath_ipv6_l4() +{ + log_info "Running IPv6 L4 hash multipath tests" + multipath6_l4_test "ECMP" 1 1 + multipath6_l4_test "Weighted MP 2:1" 2 1 + multipath6_l4_test "Weighted MP 11:45" 11 45 +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 927f9ba49e08..98ea37d26c44 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -1270,3 +1270,110 @@ tcpdump_show() { tcpdump -e -n -r $capfile 2>&1 } + +# return 0 if the packet wasn't seen on host2_if or 1 if it was +mcast_packet_test() +{ + local mac=$1 + local src_ip=$2 + local ip=$3 + local host1_if=$4 + local host2_if=$5 + local seen=0 + local tc_proto="ip" + local mz_v6arg="" + + # basic check to see if we were passed an IPv4 address, if not assume IPv6 + if [[ ! $ip =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then + tc_proto="ipv6" + mz_v6arg="-6" + fi + + # Add an ACL on `host2_if` which will tell us whether the packet + # was received by it or not. + tc qdisc add dev $host2_if ingress + tc filter add dev $host2_if ingress protocol $tc_proto pref 1 handle 101 \ + flower ip_proto udp dst_mac $mac action drop + + $MZ $host1_if $mz_v6arg -c 1 -p 64 -b $mac -A $src_ip -B $ip -t udp "dp=4096,sp=2048" -q + sleep 1 + + tc -j -s filter show dev $host2_if ingress \ + | jq -e ".[] | select(.options.handle == 101) \ + | select(.options.actions[0].stats.packets == 1)" &> /dev/null + if [[ $? -eq 0 ]]; then + seen=1 + fi + + tc filter del dev $host2_if ingress protocol $tc_proto pref 1 handle 101 flower + tc qdisc del dev $host2_if ingress + + return $seen +} + +brmcast_check_sg_entries() +{ + local report=$1; shift + local slist=("$@") + local sarg="" + + for src in "${slist[@]}"; do + sarg="${sarg} and .source_list[].address == \"$src\"" + done + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and .source_list != null $sarg)" &>/dev/null + check_err $? "Wrong *,G entry source list after $report report" + + for sgent in "${slist[@]}"; do + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and .src == \"$sgent\")" &>/dev/null + check_err $? "Missing S,G entry ($sgent, $TEST_GROUP)" + done +} + +brmcast_check_sg_fwding() +{ + local should_fwd=$1; shift + local sources=("$@") + + for src in "${sources[@]}"; do + local retval=0 + + mcast_packet_test $TEST_GROUP_MAC $src $TEST_GROUP $h2 $h1 + retval=$? + if [ $should_fwd -eq 1 ]; then + check_fail $retval "Didn't forward traffic from S,G ($src, $TEST_GROUP)" + else + check_err $retval "Forwarded traffic for blocked S,G ($src, $TEST_GROUP)" + fi + done +} + +brmcast_check_sg_state() +{ + local is_blocked=$1; shift + local sources=("$@") + local should_fail=1 + + if [ $is_blocked -eq 1 ]; then + should_fail=0 + fi + + for src in "${sources[@]}"; do + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and .source_list != null) | + .source_list[] | + select(.address == \"$src\") | + select(.timer == \"0.00\")" &>/dev/null + check_err_fail $should_fail $? "Entry $src has zero timer" + + bridge -j -d -s mdb show dev br0 \ + | jq -e ".[].mdb[] | \ + select(.grp == \"$TEST_GROUP\" and .src == \"$src\" and \ + .flags[] == \"blocked\")" &>/dev/null + check_err_fail $should_fail $? "Entry $src has blocked flag" + done +} diff --git a/tools/testing/selftests/net/forwarding/q_in_vni.sh b/tools/testing/selftests/net/forwarding/q_in_vni.sh new file mode 100755 index 000000000000..4c50c0234bce --- /dev/null +++ b/tools/testing/selftests/net/forwarding/q_in_vni.sh @@ -0,0 +1,347 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +-----------------------+ +------------------------+ +# | H1 (vrf) | | H2 (vrf) | +# | + $h1.10 | | + $h2.10 | +# | | 192.0.2.1/28 | | | 192.0.2.2/28 | +# | | | | | | +# | | + $h1.20 | | | + $h2.20 | +# | \ | 198.51.100.1/24 | | \ | 198.51.100.2/24 | +# | \| | | \| | +# | + $h1 | | + $h2 | +# +----|------------------+ +----|-------------------+ +# | | +# +----|--------------------------------------------------|-------------------+ +# | SW | | | +# | +--|--------------------------------------------------|-----------------+ | +# | | + $swp1 BR1 (802.1ad) + $swp2 | | +# | | vid 100 pvid untagged vid 100 pvid | | +# | | untagged | | +# | | + vx100 (vxlan) | | +# | | local 192.0.2.17 | | +# | | remote 192.0.2.34 192.0.2.50 | | +# | | id 1000 dstport $VXPORT | | +# | | vid 100 pvid untagged | | +# | +-----------------------------------------------------------------------+ | +# | | +# | 192.0.2.32/28 via 192.0.2.18 | +# | 192.0.2.48/28 via 192.0.2.18 | +# | | +# | + $rp1 | +# | | 192.0.2.17/28 | +# +----|----------------------------------------------------------------------+ +# | +# +----|--------------------------------------------------------+ +# | | VRP2 (vrf) | +# | + $rp2 | +# | 192.0.2.18/28 | +# | | (maybe) HW +# ============================================================================= +# | | (likely) SW +# | + v1 (veth) + v3 (veth) | +# | | 192.0.2.33/28 | 192.0.2.49/28 | +# +----|---------------------------------------|----------------+ +# | | +# +----|------------------------------+ +----|------------------------------+ +# | + v2 (veth) NS1 (netns) | | + v4 (veth) NS2 (netns) | +# | 192.0.2.34/28 | | 192.0.2.50/28 | +# | | | | +# | 192.0.2.16/28 via 192.0.2.33 | | 192.0.2.16/28 via 192.0.2.49 | +# | 192.0.2.50/32 via 192.0.2.33 | | 192.0.2.34/32 via 192.0.2.49 | +# | | | | +# | +-------------------------------+ | | +-------------------------------+ | +# | | BR2 (802.1ad) | | | | BR2 (802.1ad) | | +# | | + vx100 (vxlan) | | | | + vx100 (vxlan) | | +# | | local 192.0.2.34 | | | | local 192.0.2.50 | | +# | | remote 192.0.2.17 | | | | remote 192.0.2.17 | | +# | | remote 192.0.2.50 | | | | remote 192.0.2.34 | | +# | | id 1000 dstport $VXPORT | | | | id 1000 dstport $VXPORT | | +# | | vid 100 pvid untagged | | | | vid 100 pvid untagged | | +# | | | | | | | | +# | | + w1 (veth) | | | | + w1 (veth) | | +# | | | vid 100 pvid untagged | | | | | vid 100 pvid untagged | | +# | +--|----------------------------+ | | +--|----------------------------+ | +# | | | | | | +# | +--|----------------------------+ | | +--|----------------------------+ | +# | | | VW2 (vrf) | | | | | VW2 (vrf) | | +# | | + w2 (veth) | | | | + w2 (veth) | | +# | | |\ | | | | |\ | | +# | | | + w2.10 | | | | | + w2.10 | | +# | | | 192.0.2.3/28 | | | | | 192.0.2.4/28 | | +# | | | | | | | | | | +# | | + w2.20 | | | | + w2.20 | | +# | | 198.51.100.3/24 | | | | 198.51.100.4/24 | | +# | +-------------------------------+ | | +-------------------------------+ | +# +-----------------------------------+ +-----------------------------------+ + +: ${VXPORT:=4789} +export VXPORT + +: ${ALL_TESTS:=" + ping_ipv4 + "} + +NUM_NETIFS=6 +source lib.sh + +h1_create() +{ + simple_if_init $h1 + tc qdisc add dev $h1 clsact + vlan_create $h1 10 v$h1 192.0.2.1/28 + vlan_create $h1 20 v$h1 198.51.100.1/24 +} + +h1_destroy() +{ + vlan_destroy $h1 20 + vlan_destroy $h1 10 + tc qdisc del dev $h1 clsact + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 + tc qdisc add dev $h2 clsact + vlan_create $h2 10 v$h2 192.0.2.2/28 + vlan_create $h2 20 v$h2 198.51.100.2/24 +} + +h2_destroy() +{ + vlan_destroy $h2 20 + vlan_destroy $h2 10 + tc qdisc del dev $h2 clsact + simple_if_fini $h2 +} + +rp1_set_addr() +{ + ip address add dev $rp1 192.0.2.17/28 + + ip route add 192.0.2.32/28 nexthop via 192.0.2.18 + ip route add 192.0.2.48/28 nexthop via 192.0.2.18 +} + +rp1_unset_addr() +{ + ip route del 192.0.2.48/28 nexthop via 192.0.2.18 + ip route del 192.0.2.32/28 nexthop via 192.0.2.18 + + ip address del dev $rp1 192.0.2.17/28 +} + +switch_create() +{ + ip link add name br1 type bridge vlan_filtering 1 vlan_protocol 802.1ad \ + vlan_default_pvid 0 mcast_snooping 0 + # Make sure the bridge uses the MAC address of the local port and not + # that of the VxLAN's device. + ip link set dev br1 address $(mac_get $swp1) + ip link set dev br1 up + + ip link set dev $rp1 up + rp1_set_addr + + ip link add name vx100 type vxlan id 1000 \ + local 192.0.2.17 dstport "$VXPORT" \ + nolearning noudpcsum tos inherit ttl 100 + ip link set dev vx100 up + + ip link set dev vx100 master br1 + bridge vlan add vid 100 dev vx100 pvid untagged + + ip link set dev $swp1 master br1 + ip link set dev $swp1 up + bridge vlan add vid 100 dev $swp1 pvid untagged + + ip link set dev $swp2 master br1 + ip link set dev $swp2 up + bridge vlan add vid 100 dev $swp2 pvid untagged + + bridge fdb append dev vx100 00:00:00:00:00:00 dst 192.0.2.34 self + bridge fdb append dev vx100 00:00:00:00:00:00 dst 192.0.2.50 self +} + +switch_destroy() +{ + bridge fdb del dev vx100 00:00:00:00:00:00 dst 192.0.2.50 self + bridge fdb del dev vx100 00:00:00:00:00:00 dst 192.0.2.34 self + + bridge vlan del vid 100 dev $swp2 + ip link set dev $swp2 down + ip link set dev $swp2 nomaster + + bridge vlan del vid 100 dev $swp1 + ip link set dev $swp1 down + ip link set dev $swp1 nomaster + + ip link set dev vx100 nomaster + ip link set dev vx100 down + ip link del dev vx100 + + rp1_unset_addr + ip link set dev $rp1 down + + ip link set dev br1 down + ip link del dev br1 +} + +vrp2_create() +{ + simple_if_init $rp2 192.0.2.18/28 + __simple_if_init v1 v$rp2 192.0.2.33/28 + __simple_if_init v3 v$rp2 192.0.2.49/28 + tc qdisc add dev v1 clsact +} + +vrp2_destroy() +{ + tc qdisc del dev v1 clsact + __simple_if_fini v3 192.0.2.49/28 + __simple_if_fini v1 192.0.2.33/28 + simple_if_fini $rp2 192.0.2.18/28 +} + +ns_init_common() +{ + local in_if=$1; shift + local in_addr=$1; shift + local other_in_addr=$1; shift + local nh_addr=$1; shift + local host_addr1=$1; shift + local host_addr2=$1; shift + + ip link set dev $in_if up + ip address add dev $in_if $in_addr/28 + tc qdisc add dev $in_if clsact + + ip link add name br2 type bridge vlan_filtering 1 vlan_protocol 802.1ad \ + vlan_default_pvid 0 + ip link set dev br2 up + + ip link add name w1 type veth peer name w2 + + ip link set dev w1 master br2 + ip link set dev w1 up + bridge vlan add vid 100 dev w1 pvid untagged + + ip link add name vx100 type vxlan id 1000 local $in_addr \ + dstport "$VXPORT" + ip link set dev vx100 up + bridge fdb append dev vx100 00:00:00:00:00:00 dst 192.0.2.17 self + bridge fdb append dev vx100 00:00:00:00:00:00 dst $other_in_addr self + + ip link set dev vx100 master br2 + tc qdisc add dev vx100 clsact + + bridge vlan add vid 100 dev vx100 pvid untagged + + simple_if_init w2 + vlan_create w2 10 vw2 $host_addr1/28 + vlan_create w2 20 vw2 $host_addr2/24 + + ip route add 192.0.2.16/28 nexthop via $nh_addr + ip route add $other_in_addr/32 nexthop via $nh_addr +} +export -f ns_init_common + +ns1_create() +{ + ip netns add ns1 + ip link set dev v2 netns ns1 + in_ns ns1 \ + ns_init_common v2 192.0.2.34 192.0.2.50 192.0.2.33 \ + 192.0.2.3 198.51.100.3 +} + +ns1_destroy() +{ + ip netns exec ns1 ip link set dev v2 netns 1 + ip netns del ns1 +} + +ns2_create() +{ + ip netns add ns2 + ip link set dev v4 netns ns2 + in_ns ns2 \ + ns_init_common v4 192.0.2.50 192.0.2.34 192.0.2.49 \ + 192.0.2.4 198.51.100.4 +} + +ns2_destroy() +{ + ip netns exec ns2 ip link set dev v4 netns 1 + ip netns del ns2 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + rp1=${NETIFS[p5]} + rp2=${NETIFS[p6]} + + vrf_prepare + forwarding_enable + + h1_create + h2_create + switch_create + + ip link add name v1 type veth peer name v2 + ip link add name v3 type veth peer name v4 + vrp2_create + ns1_create + ns2_create + + r1_mac=$(in_ns ns1 mac_get w2) + r2_mac=$(in_ns ns2 mac_get w2) + h2_mac=$(mac_get $h2) +} + +cleanup() +{ + pre_cleanup + + ns2_destroy + ns1_destroy + vrp2_destroy + ip link del dev v3 + ip link del dev v1 + + switch_destroy + h2_destroy + h1_destroy + + forwarding_restore + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 192.0.2.2 ": local->local" + ping_test $h1 192.0.2.3 ": local->remote 1" + ping_test $h1 192.0.2.4 ": local->remote 2" +} + +test_all() +{ + echo "Running tests with UDP port $VXPORT" + tests_run +} + +trap cleanup EXIT + +setup_prepare +setup_wait +test_all + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh index cf3d26c233e8..388e4492b81b 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh @@ -1,7 +1,13 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="ping_ipv4 ping_ipv6 multipath_test" +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + multipath_test + ping_ipv4_blackhole + ping_ipv6_blackhole +" NUM_NETIFS=8 source lib.sh @@ -280,6 +286,17 @@ multipath_test() multipath4_test "Weighted MP 2:1" 2 1 multipath4_test "Weighted MP 11:45" 11 45 + log_info "Running IPv4 multipath tests with IPv6 link-local nexthops" + ip nexthop replace id 101 via fe80:2::22 dev $rp12 + ip nexthop replace id 102 via fe80:3::23 dev $rp13 + + multipath4_test "ECMP" 1 1 + multipath4_test "Weighted MP 2:1" 2 1 + multipath4_test "Weighted MP 11:45" 11 45 + + ip nexthop replace id 102 via 169.254.3.23 dev $rp13 + ip nexthop replace id 101 via 169.254.2.22 dev $rp12 + log_info "Running IPv6 multipath tests" multipath6_test "ECMP" 1 1 multipath6_test "Weighted MP 2:1" 2 1 @@ -291,6 +308,56 @@ multipath_test() multipath6_l4_test "Weighted MP 11:45" 11 45 } +ping_ipv4_blackhole() +{ + RET=0 + + ip nexthop add id 1001 blackhole + ip nexthop add id 1002 group 1001 + + ip route replace 198.51.100.0/24 vrf vrf-r1 nhid 1001 + ping_do $h1 198.51.100.2 + check_fail $? "ping did not fail when using a blackhole nexthop" + + ip route replace 198.51.100.0/24 vrf vrf-r1 nhid 1002 + ping_do $h1 198.51.100.2 + check_fail $? "ping did not fail when using a blackhole nexthop group" + + ip route replace 198.51.100.0/24 vrf vrf-r1 nhid 103 + ping_do $h1 198.51.100.2 + check_err $? "ping failed with a valid nexthop" + + log_test "IPv4 blackhole ping" + + ip nexthop del id 1002 + ip nexthop del id 1001 +} + +ping_ipv6_blackhole() +{ + RET=0 + + ip -6 nexthop add id 1001 blackhole + ip nexthop add id 1002 group 1001 + + ip route replace 2001:db8:2::/64 vrf vrf-r1 nhid 1001 + ping6_do $h1 2001:db8:2::2 + check_fail $? "ping did not fail when using a blackhole nexthop" + + ip route replace 2001:db8:2::/64 vrf vrf-r1 nhid 1002 + ping6_do $h1 2001:db8:2::2 + check_fail $? "ping did not fail when using a blackhole nexthop group" + + ip route replace 2001:db8:2::/64 vrf vrf-r1 nhid 106 + ping6_do $h1 2001:db8:2::2 + check_err $? "ping failed with a valid nexthop" + + log_test "IPv6 blackhole ping" + + ip nexthop del id 1002 + ip -6 nexthop del id 1001 +} + setup_prepare() { h1=${NETIFS[p1]} @@ -312,7 +379,6 @@ setup_prepare() router1_create router2_create - routing_nh_obj forwarding_enable } diff --git a/tools/testing/selftests/net/forwarding/router_nh.sh b/tools/testing/selftests/net/forwarding/router_nh.sh new file mode 100755 index 000000000000..f3a53738bdcc --- /dev/null +++ b/tools/testing/selftests/net/forwarding/router_nh.sh @@ -0,0 +1,160 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + ping_ipv4 + ping_ipv6 +" + +NUM_NETIFS=4 +source lib.sh +source tc_common.sh + +h1_create() +{ + vrf_create "vrf-h1" + ip link set dev $h1 master vrf-h1 + + ip link set dev vrf-h1 up + ip link set dev $h1 up + + ip address add 192.0.2.2/24 dev $h1 + ip address add 2001:db8:1::2/64 dev $h1 + + ip route add 198.51.100.0/24 vrf vrf-h1 nexthop via 192.0.2.1 + ip route add 2001:db8:2::/64 vrf vrf-h1 nexthop via 2001:db8:1::1 +} + +h1_destroy() +{ + ip route del 2001:db8:2::/64 vrf vrf-h1 + ip route del 198.51.100.0/24 vrf vrf-h1 + + ip address del 2001:db8:1::2/64 dev $h1 + ip address del 192.0.2.2/24 dev $h1 + + ip link set dev $h1 down + vrf_destroy "vrf-h1" +} + +h2_create() +{ + vrf_create "vrf-h2" + ip link set dev $h2 master vrf-h2 + + ip link set dev vrf-h2 up + ip link set dev $h2 up + + ip address add 198.51.100.2/24 dev $h2 + ip address add 2001:db8:2::2/64 dev $h2 + + ip route add 192.0.2.0/24 vrf vrf-h2 nexthop via 198.51.100.1 + ip route add 2001:db8:1::/64 vrf vrf-h2 nexthop via 2001:db8:2::1 +} + +h2_destroy() +{ + ip route del 2001:db8:1::/64 vrf vrf-h2 + ip route del 192.0.2.0/24 vrf vrf-h2 + + ip address del 2001:db8:2::2/64 dev $h2 + ip address del 198.51.100.2/24 dev $h2 + + ip link set dev $h2 down + vrf_destroy "vrf-h2" +} + +router_create() +{ + ip link set dev $rp1 up + ip link set dev $rp2 up + + tc qdisc add dev $rp2 clsact + + ip address add 192.0.2.1/24 dev $rp1 + ip address add 2001:db8:1::1/64 dev $rp1 + + ip address add 198.51.100.1/24 dev $rp2 + ip address add 2001:db8:2::1/64 dev $rp2 +} + +router_destroy() +{ + ip address del 2001:db8:2::1/64 dev $rp2 + ip address del 198.51.100.1/24 dev $rp2 + + ip address del 2001:db8:1::1/64 dev $rp1 + ip address del 192.0.2.1/24 dev $rp1 + + tc qdisc del dev $rp2 clsact + + ip link set dev $rp2 down + ip link set dev $rp1 down +} + +routing_nh_obj() +{ + # Create the nexthops as AF_INET6, so that IPv4 and IPv6 routes could + # use them. + ip -6 nexthop add id 101 dev $rp1 + ip -6 nexthop add id 102 dev $rp2 + + ip route replace 192.0.2.0/24 nhid 101 + ip route replace 2001:db8:1::/64 nhid 101 + ip route replace 198.51.100.0/24 nhid 102 + ip route replace 2001:db8:2::/64 nhid 102 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp1=${NETIFS[p2]} + + rp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + rp1mac=$(mac_get $rp1) + + vrf_prepare + + h1_create + h2_create + + router_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + router_destroy + + h2_destroy + h1_destroy + + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 198.51.100.2 +} + +ping_ipv6() +{ + ping6_test $h1 2001:db8:2::2 +} + +trap cleanup EXIT + +setup_prepare +setup_wait +routing_nh_obj + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/tc_mpls_l2vpn.sh b/tools/testing/selftests/net/forwarding/tc_mpls_l2vpn.sh new file mode 100755 index 000000000000..03743f04e178 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/tc_mpls_l2vpn.sh @@ -0,0 +1,192 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +-----------------------+ +# | H1 (v$h1) | +# | 192.0.2.1/24 | +# | 2001:db8::1/124 | +# | + $h1 | +# +-----------------|-----+ +# | +# | (Plain Ethernet traffic) +# | +# +-----------------|-----------------------------------------+ +# | LER1 + $edge1 | +# | -ingress: | +# | -encapsulate Ethernet into MPLS | +# | -add outer Ethernet header | +# | -redirect to $mpls1 (egress) | +# | | +# | + $mpls1 | +# | | -ingress: | +# | | -remove outer Ethernet header | +# | | -remove MPLS header | +# | | -redirect to $edge1 (egress) | +# +-----------------|-----------------------------------------+ +# | +# | (Ethernet over MPLS traffic) +# | +# +-----------------|-----------------------------------------+ +# | LER2 + $mpls2 | +# | -ingress: | +# | -remove outer Ethernet header | +# | -remove MPLS header | +# | -redirect to $edge2 (egress) | +# | | +# | + $edge2 | +# | | -ingress: | +# | | -encapsulate Ethernet into MPLS | +# | | -add outer Ethernet header | +# | | -redirect to $mpls2 (egress) | +# +-----------------|-----------------------------------------| +# | +# | (Plain Ethernet traffic) +# | +# +-----------------|-----+ +# | H2 (v$h2) | | +# | + $h2 | +# | 192.0.2.2/24 | +# | 2001:db8::2/124 | +# +-----------------------+ +# +# LER1 and LER2 logically represent two different routers. However, no VRF is +# created for them, as they don't do any IP routing. + +ALL_TESTS="mpls_forward_eth" +NUM_NETIFS=6 +source lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 2001:db8::1/124 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/24 2001:db8::1/124 +} + +h2_create() +{ + simple_if_init $h2 192.0.2.2/24 2001:db8::2/124 +} + +h2_destroy() +{ + simple_if_fini $h2 192.0.2.2/24 2001:db8::2/124 +} + +ler1_create() +{ + tc qdisc add dev $edge1 ingress + tc filter add dev $edge1 ingress \ + matchall \ + action mpls mac_push label 102 \ + action vlan push_eth dst_mac $mpls2mac src_mac $mpls1mac \ + action mirred egress redirect dev $mpls1 + ip link set dev $edge1 up + + tc qdisc add dev $mpls1 ingress + tc filter add dev $mpls1 ingress \ + protocol mpls_uc \ + flower mpls_label 101 \ + action vlan pop_eth \ + action mpls pop protocol teb \ + action mirred egress redirect dev $edge1 + ip link set dev $mpls1 up +} + +ler1_destroy() +{ + ip link set dev $mpls1 down + tc qdisc del dev $mpls1 ingress + + ip link set dev $edge1 down + tc qdisc del dev $edge1 ingress +} + +ler2_create() +{ + tc qdisc add dev $edge2 ingress + tc filter add dev $edge2 ingress \ + matchall \ + action mpls mac_push label 101 \ + action vlan push_eth dst_mac $mpls1mac src_mac $mpls2mac \ + action mirred egress redirect dev $mpls2 + ip link set dev $edge2 up + + tc qdisc add dev $mpls2 ingress + tc filter add dev $mpls2 ingress \ + protocol mpls_uc \ + flower mpls_label 102 \ + action vlan pop_eth \ + action mpls pop protocol teb \ + action mirred egress redirect dev $edge2 + ip link set dev $mpls2 up +} + +ler2_destroy() +{ + ip link set dev $mpls2 down + tc qdisc del dev $mpls2 ingress + + ip link set dev $edge2 down + tc qdisc del dev $edge2 ingress +} + +mpls_forward_eth() +{ + ping_test $h1 192.0.2.2 + ping6_test $h1 2001:db8::2 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + edge1=${NETIFS[p2]} + + mpls1=${NETIFS[p3]} + mpls2=${NETIFS[p4]} + + edge2=${NETIFS[p5]} + h2=${NETIFS[p6]} + + mpls1mac=$(mac_get $mpls1) + mpls2mac=$(mac_get $mpls2) + + vrf_prepare + + h1_create + h2_create + ler1_create + ler2_create +} + +cleanup() +{ + pre_cleanup + + ler2_destroy + ler1_destroy + h2_destroy + h1_destroy + + vrf_cleanup +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +tc_offload_check +if [[ $? -ne 0 ]]; then + log_info "Could not test offloaded functionality" +else + tcflags="skip_sw" + tests_run +fi + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config index 741a1c4f4ae8..0faaccd21447 100644 --- a/tools/testing/selftests/net/mptcp/config +++ b/tools/testing/selftests/net/mptcp/config @@ -5,3 +5,13 @@ CONFIG_INET_DIAG=m CONFIG_INET_MPTCP_DIAG=m CONFIG_VETH=y CONFIG_NET_SCH_NETEM=m +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NF_TABLES=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_COMPAT=m +CONFIG_NETFILTER_XTABLES=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NF_TABLES_IPV6=y diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 08f53d86dedc..9aa9624cff97 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -5,6 +5,7 @@ ret=0 sin="" sout="" cin="" +cinsent="" cout="" ksft_skip=4 timeout=30 @@ -13,6 +14,24 @@ capture=0 TEST_COUNT=0 +# generated using "nfbpf_compile '(ip && (ip[54] & 0xf0) == 0x30) || +# (ip6 && (ip6[74] & 0xf0) == 0x30)'" +CBPF_MPTCP_SUBOPTION_ADD_ADDR="14, + 48 0 0 0, + 84 0 0 240, + 21 0 3 64, + 48 0 0 54, + 84 0 0 240, + 21 6 7 48, + 48 0 0 0, + 84 0 0 240, + 21 0 4 96, + 48 0 0 74, + 84 0 0 240, + 21 0 1 48, + 6 0 0 65535, + 6 0 0 0" + init() { capout=$(mktemp) @@ -63,7 +82,7 @@ cleanup_partial() cleanup() { rm -f "$cin" "$cout" - rm -f "$sin" "$sout" + rm -f "$sin" "$sout" "$cinsent" cleanup_partial } @@ -82,6 +101,26 @@ reset_with_cookies() done } +reset_with_add_addr_timeout() +{ + local ip="${1:-4}" + local tables + + tables="iptables" + if [ $ip -eq 6 ]; then + tables="ip6tables" + fi + + reset + + ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=1 + ip netns exec $ns2 $tables -A OUTPUT -p tcp \ + -m tcp --tcp-option 30 \ + -m bpf --bytecode \ + "$CBPF_MPTCP_SUBOPTION_ADD_ADDR" \ + -j DROP +} + for arg in "$@"; do if [ "$arg" = "-c" ]; then capture=1 @@ -94,6 +133,24 @@ if [ $? -ne 0 ];then exit $ksft_skip fi +iptables -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run all tests without iptables tool" + exit $ksft_skip +fi + +ip6tables -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run all tests without ip6tables tool" + exit $ksft_skip +fi + +print_file_err() +{ + ls -l "$1" 1>&2 + echo "Trailing bytes are: " + tail -c 27 "$1" +} check_transfer() { @@ -106,6 +163,7 @@ check_transfer() echo "[ FAIL ] $what does not match (in, out):" print_file_err "$in" print_file_err "$out" + ret=1 return 1 fi @@ -126,6 +184,23 @@ do_ping() fi } +link_failure() +{ + ns="$1" + + l=$((RANDOM%4)) + l=$((l+1)) + + veth="ns1eth$l" + ip -net "$ns" link set "$veth" down +} + +# $1: IP address +is_v6() +{ + [ -z "${1##*:*}" ] +} + do_transfer() { listener_ns="$1" @@ -133,8 +208,10 @@ do_transfer() cl_proto="$3" srv_proto="$4" connect_addr="$5" - rm_nr_ns1="$6" - rm_nr_ns2="$7" + test_link_fail="$6" + rm_nr_ns1="$7" + rm_nr_ns2="$8" + speed="$9" port=$((10000+$TEST_COUNT)) TEST_COUNT=$((TEST_COUNT+1)) @@ -159,42 +236,65 @@ do_transfer() sleep 1 fi - if [[ $rm_nr_ns1 -eq 0 && $rm_nr_ns2 -eq 0 ]]; then + if [ $speed = "fast" ]; then mptcp_connect="./mptcp_connect -j" else mptcp_connect="./mptcp_connect -r" fi - ip netns exec ${listener_ns} $mptcp_connect -t $timeout -l -p $port -s ${srv_proto} 0.0.0.0 < "$sin" > "$sout" & + local local_addr + if is_v6 "${connect_addr}"; then + local_addr="::" + else + local_addr="0.0.0.0" + fi + + ip netns exec ${listener_ns} $mptcp_connect -t $timeout -l -p $port \ + -s ${srv_proto} ${local_addr} < "$sin" > "$sout" & spid=$! sleep 1 - ip netns exec ${connector_ns} $mptcp_connect -t $timeout -p $port -s ${cl_proto} $connect_addr < "$cin" > "$cout" & + if [ "$test_link_fail" -eq 0 ];then + ip netns exec ${connector_ns} $mptcp_connect -t $timeout -p $port -s ${cl_proto} $connect_addr < "$cin" > "$cout" & + else + ( cat "$cin" ; sleep 2; link_failure $listener_ns ; cat "$cin" ) | tee "$cinsent" | \ + ip netns exec ${connector_ns} $mptcp_connect -t $timeout -p $port -s ${cl_proto} $connect_addr > "$cout" & + fi cpid=$! if [ $rm_nr_ns1 -gt 0 ]; then - counter=1 - sleep 1 + if [ $rm_nr_ns1 -lt 8 ]; then + counter=1 + sleep 1 - while [ $counter -le $rm_nr_ns1 ] - do - ip netns exec ${listener_ns} ./pm_nl_ctl del $counter + while [ $counter -le $rm_nr_ns1 ] + do + ip netns exec ${listener_ns} ./pm_nl_ctl del $counter + sleep 1 + let counter+=1 + done + else sleep 1 - let counter+=1 - done + ip netns exec ${listener_ns} ./pm_nl_ctl flush + fi fi if [ $rm_nr_ns2 -gt 0 ]; then - counter=1 - sleep 1 + if [ $rm_nr_ns2 -lt 8 ]; then + counter=1 + sleep 1 - while [ $counter -le $rm_nr_ns2 ] - do - ip netns exec ${connector_ns} ./pm_nl_ctl del $counter + while [ $counter -le $rm_nr_ns2 ] + do + ip netns exec ${connector_ns} ./pm_nl_ctl del $counter + sleep 1 + let counter+=1 + done + else sleep 1 - let counter+=1 - done + ip netns exec ${connector_ns} ./pm_nl_ctl flush + fi fi wait $cpid @@ -215,12 +315,17 @@ do_transfer() ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port" cat "$capout" + ret=1 return 1 fi check_transfer $sin $cout "file received by client" retc=$? - check_transfer $cin $sout "file received by server" + if [ "$test_link_fail" -eq 0 ];then + check_transfer $cin $sout "file received by server" + else + check_transfer $cinsent $sout "file received by server" + fi rets=$? if [ $retc -eq 0 ] && [ $rets -eq 0 ];then @@ -236,13 +341,12 @@ make_file() { name=$1 who=$2 + size=$3 - SIZE=1 - - dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null + dd if=/dev/urandom of="$name" bs=1024 count=$size 2> /dev/null echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name" - echo "Created $name (size $SIZE KB) containing data sent by $who" + echo "Created $name (size $size KB) containing data sent by $who" } run_tests() @@ -250,27 +354,32 @@ run_tests() listener_ns="$1" connector_ns="$2" connect_addr="$3" + test_linkfail="${4:-0}" + rm_nr_ns1="${5:-0}" + rm_nr_ns2="${6:-0}" + speed="${7:-fast}" lret=0 + oldin="" - do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} 0 0 - lret=$? - if [ $lret -ne 0 ]; then - ret=$lret - return - fi -} + if [ "$test_linkfail" -eq 1 ];then + size=$((RANDOM%1024)) + size=$((size+1)) + size=$((size*128)) -run_remove_tests() -{ - listener_ns="$1" - connector_ns="$2" - connect_addr="$3" - rm_nr_ns1="$4" - rm_nr_ns2="$5" - lret=0 + oldin=$(mktemp) + cp "$cin" "$oldin" + make_file "$cin" "client" $size + fi - do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} ${rm_nr_ns1} ${rm_nr_ns2} + do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} \ + ${test_linkfail} ${rm_nr_ns1} ${rm_nr_ns2} ${speed} lret=$? + + if [ "$test_linkfail" -eq 1 ];then + cp "$oldin" "$cin" + rm -f "$oldin" + fi + if [ $lret -ne 0 ]; then ret=$lret return @@ -403,10 +512,11 @@ chk_rm_nr() sin=$(mktemp) sout=$(mktemp) cin=$(mktemp) +cinsent=$(mktemp) cout=$(mktemp) init -make_file "$cin" "client" -make_file "$sin" "server" +make_file "$cin" "client" 1 +make_file "$sin" "server" 1 trap cleanup EXIT run_tests $ns1 $ns2 10.0.1.1 @@ -491,12 +601,32 @@ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr "multiple subflows and signal" 3 3 3 chk_add_nr 1 1 +# accept and use add_addr with additional subflows and link loss +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 3 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +ip netns exec $ns2 ./pm_nl_ctl limits 1 3 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 1 +chk_join_nr "multiple flows, signal, link failure" 3 3 3 +chk_add_nr 1 1 + +# add_addr timeout +reset_with_add_addr_timeout +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl limits 1 1 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow +chk_join_nr "signal address, ADD_ADDR timeout" 1 1 1 +chk_add_nr 4 0 + # single subflow, remove reset ip netns exec $ns1 ./pm_nl_ctl limits 0 1 ip netns exec $ns2 ./pm_nl_ctl limits 0 1 ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow -run_remove_tests $ns1 $ns2 10.0.1.1 0 1 +run_tests $ns1 $ns2 10.0.1.1 0 0 1 slow chk_join_nr "remove single subflow" 1 1 1 chk_rm_nr 1 1 @@ -506,7 +636,7 @@ ip netns exec $ns1 ./pm_nl_ctl limits 0 2 ip netns exec $ns2 ./pm_nl_ctl limits 0 2 ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow -run_remove_tests $ns1 $ns2 10.0.1.1 0 2 +run_tests $ns1 $ns2 10.0.1.1 0 0 2 slow chk_join_nr "remove multiple subflows" 2 2 2 chk_rm_nr 2 2 @@ -515,7 +645,7 @@ reset ip netns exec $ns1 ./pm_nl_ctl limits 0 1 ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal ip netns exec $ns2 ./pm_nl_ctl limits 1 1 -run_remove_tests $ns1 $ns2 10.0.1.1 1 0 +run_tests $ns1 $ns2 10.0.1.1 0 1 0 slow chk_join_nr "remove single address" 1 1 1 chk_add_nr 1 1 chk_rm_nr 0 0 @@ -526,7 +656,7 @@ ip netns exec $ns1 ./pm_nl_ctl limits 0 2 ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal ip netns exec $ns2 ./pm_nl_ctl limits 1 2 ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow -run_remove_tests $ns1 $ns2 10.0.1.1 1 1 +run_tests $ns1 $ns2 10.0.1.1 0 1 1 slow chk_join_nr "remove subflow and signal" 2 2 2 chk_add_nr 1 1 chk_rm_nr 1 1 @@ -538,11 +668,77 @@ ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal ip netns exec $ns2 ./pm_nl_ctl limits 1 3 ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow -run_remove_tests $ns1 $ns2 10.0.1.1 1 2 +run_tests $ns1 $ns2 10.0.1.1 0 1 2 slow chk_join_nr "remove subflows and signal" 3 3 3 chk_add_nr 1 1 chk_rm_nr 2 2 +# subflows and signal, flush +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 3 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +ip netns exec $ns2 ./pm_nl_ctl limits 1 3 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 0 8 8 slow +chk_join_nr "flush subflows and signal" 3 3 3 +chk_add_nr 1 1 +chk_rm_nr 2 2 + +# subflow IPv6 +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 flags subflow +run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow +chk_join_nr "single subflow IPv6" 1 1 1 + +# add_address, unused IPv6 +reset +ip netns exec $ns1 ./pm_nl_ctl add dead:beef:2::1 flags signal +run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow +chk_join_nr "unused signal address IPv6" 0 0 0 +chk_add_nr 1 1 + +# signal address IPv6 +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns1 ./pm_nl_ctl add dead:beef:2::1 flags signal +ip netns exec $ns2 ./pm_nl_ctl limits 1 1 +run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow +chk_join_nr "single address IPv6" 1 1 1 +chk_add_nr 1 1 + +# add_addr timeout IPv6 +reset_with_add_addr_timeout 6 +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl limits 1 1 +ip netns exec $ns1 ./pm_nl_ctl add dead:beef:2::1 flags signal +run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow +chk_join_nr "signal address, ADD_ADDR6 timeout" 1 1 1 +chk_add_nr 4 0 + +# single address IPv6, remove +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns1 ./pm_nl_ctl add dead:beef:2::1 flags signal +ip netns exec $ns2 ./pm_nl_ctl limits 1 1 +run_tests $ns1 $ns2 dead:beef:1::1 0 1 0 slow +chk_join_nr "remove single address IPv6" 1 1 1 +chk_add_nr 1 1 +chk_rm_nr 0 0 + +# subflow and signal IPv6, remove +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 2 +ip netns exec $ns1 ./pm_nl_ctl add dead:beef:2::1 flags signal +ip netns exec $ns2 ./pm_nl_ctl limits 1 2 +ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 flags subflow +run_tests $ns1 $ns2 dead:beef:1::1 0 1 1 slow +chk_join_nr "remove subflow and signal IPv6" 2 2 2 +chk_add_nr 1 1 +chk_rm_nr 1 1 + # single subflow, syncookies reset_with_cookies ip netns exec $ns1 ./pm_nl_ctl limits 0 1 diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 6bbf69a28e12..464e31eabc73 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -355,7 +355,7 @@ setup_fou_or_gue() { encap="${3}" if [ "${outer}" = "4" ]; then - modprobe fou || return 2 + modprobe fou || return $ksft_skip a_addr="${prefix4}.${a_r1}.1" b_addr="${prefix4}.${b_r1}.1" if [ "${inner}" = "4" ]; then @@ -366,7 +366,7 @@ setup_fou_or_gue() { ipproto="41" fi else - modprobe fou6 || return 2 + modprobe fou6 || return $ksft_skip a_addr="${prefix6}:${a_r1}::1" b_addr="${prefix6}:${b_r1}::1" if [ "${inner}" = "4" ]; then @@ -380,8 +380,8 @@ setup_fou_or_gue() { fi fi - run_cmd ${ns_a} ip fou add port 5555 ipproto ${ipproto} || return 2 - run_cmd ${ns_a} ip link add ${encap}_a type ${type} ${mode} local ${a_addr} remote ${b_addr} encap ${encap} encap-sport auto encap-dport 5556 || return 2 + run_cmd ${ns_a} ip fou add port 5555 ipproto ${ipproto} || return $ksft_skip + run_cmd ${ns_a} ip link add ${encap}_a type ${type} ${mode} local ${a_addr} remote ${b_addr} encap ${encap} encap-sport auto encap-dport 5556 || return $ksft_skip run_cmd ${ns_b} ip fou add port 5556 ipproto ${ipproto} run_cmd ${ns_b} ip link add ${encap}_b type ${type} ${mode} local ${b_addr} remote ${a_addr} encap ${encap} encap-sport auto encap-dport 5555 @@ -455,7 +455,7 @@ setup_ipvX_over_ipvY() { fi fi - run_cmd ${ns_a} ip link add ip_a type ${type} local ${a_addr} remote ${b_addr} mode ${mode} || return 2 + run_cmd ${ns_a} ip link add ip_a type ${type} local ${a_addr} remote ${b_addr} mode ${mode} || return $ksft_skip run_cmd ${ns_b} ip link add ip_b type ${type} local ${b_addr} remote ${a_addr} mode ${mode} run_cmd ${ns_a} ip link set ip_a up @@ -713,7 +713,7 @@ setup_routing() { } setup_bridge() { - run_cmd ${ns_a} ip link add br0 type bridge || return 2 + run_cmd ${ns_a} ip link add br0 type bridge || return $ksft_skip run_cmd ${ns_a} ip link set br0 up run_cmd ${ns_c} ip link add veth_C-A type veth peer name veth_A-C @@ -765,7 +765,7 @@ setup_ovs_vxlan6() { } setup_ovs_bridge() { - run_cmd ovs-vsctl add-br ovs_br0 || return 2 + run_cmd ovs-vsctl add-br ovs_br0 || return $ksft_skip run_cmd ip link set ovs_br0 up run_cmd ${ns_c} ip link add veth_C-A type veth peer name veth_A-C @@ -887,7 +887,7 @@ check_pmtu_value() { test_pmtu_ipvX() { family=${1} - setup namespaces routing || return 2 + setup namespaces routing || return $ksft_skip trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \ "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \ @@ -985,11 +985,11 @@ test_pmtu_ipvX_over_vxlanY_or_geneveY_exception() { ll_mtu=4000 if [ ${outer_family} -eq 4 ]; then - setup namespaces routing ${type}4 || return 2 + setup namespaces routing ${type}4 || return $ksft_skip # IPv4 header UDP header VXLAN/GENEVE header Ethernet header exp_mtu=$((${ll_mtu} - 20 - 8 - 8 - 14)) else - setup namespaces routing ${type}6 || return 2 + setup namespaces routing ${type}6 || return $ksft_skip # IPv6 header UDP header VXLAN/GENEVE header Ethernet header exp_mtu=$((${ll_mtu} - 40 - 8 - 8 - 14)) fi @@ -1060,11 +1060,11 @@ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() { ll_mtu=4000 if [ ${outer_family} -eq 4 ]; then - setup namespaces routing bridge bridged_${type}4 || return 2 + setup namespaces routing bridge bridged_${type}4 || return $ksft_skip # IPv4 header UDP header VXLAN/GENEVE header Ethernet header exp_mtu=$((${ll_mtu} - 20 - 8 - 8 - 14)) else - setup namespaces routing bridge bridged_${type}6 || return 2 + setup namespaces routing bridge bridged_${type}6 || return $ksft_skip # IPv6 header UDP header VXLAN/GENEVE header Ethernet header exp_mtu=$((${ll_mtu} - 40 - 8 - 8 - 14)) fi @@ -1144,11 +1144,11 @@ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception() { ll_mtu=4000 if [ ${outer_family} -eq 4 ]; then - setup namespaces routing ovs_bridge ovs_${type}4 || return 2 + setup namespaces routing ovs_bridge ovs_${type}4 || return $ksft_skip # IPv4 header UDP header VXLAN/GENEVE header Ethernet header exp_mtu=$((${ll_mtu} - 20 - 8 - 8 - 14)) else - setup namespaces routing ovs_bridge ovs_${type}6 || return 2 + setup namespaces routing ovs_bridge ovs_${type}6 || return $ksft_skip # IPv6 header UDP header VXLAN/GENEVE header Ethernet header exp_mtu=$((${ll_mtu} - 40 - 8 - 8 - 14)) fi @@ -1230,7 +1230,7 @@ test_pmtu_ipvX_over_fouY_or_gueY() { encap=${3} ll_mtu=4000 - setup namespaces routing ${encap}${outer_family}${inner_family} || return 2 + setup namespaces routing ${encap}${outer_family}${inner_family} || return $ksft_skip trace "${ns_a}" ${encap}_a "${ns_b}" ${encap}_b \ "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B @@ -1309,7 +1309,7 @@ test_pmtu_ipvX_over_ipvY_exception() { outer=${2} ll_mtu=4000 - setup namespaces routing ip${inner}ip${outer} || return 2 + setup namespaces routing ip${inner}ip${outer} || return $ksft_skip trace "${ns_a}" ip_a "${ns_b}" ip_b \ "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ @@ -1363,7 +1363,7 @@ test_pmtu_ipv6_ipv6_exception() { } test_pmtu_vti4_exception() { - setup namespaces veth vti4 xfrm4 || return 2 + setup namespaces veth vti4 xfrm4 || return $ksft_skip trace "${ns_a}" veth_a "${ns_b}" veth_b \ "${ns_a}" vti4_a "${ns_b}" vti4_b @@ -1393,7 +1393,7 @@ test_pmtu_vti4_exception() { } test_pmtu_vti6_exception() { - setup namespaces veth vti6 xfrm6 || return 2 + setup namespaces veth vti6 xfrm6 || return $ksft_skip trace "${ns_a}" veth_a "${ns_b}" veth_b \ "${ns_a}" vti6_a "${ns_b}" vti6_b fail=0 @@ -1423,7 +1423,7 @@ test_pmtu_vti6_exception() { } test_pmtu_vti4_default_mtu() { - setup namespaces veth vti4 || return 2 + setup namespaces veth vti4 || return $ksft_skip # Check that MTU of vti device is MTU of veth minus IPv4 header length veth_mtu="$(link_get_mtu "${ns_a}" veth_a)" @@ -1435,7 +1435,7 @@ test_pmtu_vti4_default_mtu() { } test_pmtu_vti6_default_mtu() { - setup namespaces veth vti6 || return 2 + setup namespaces veth vti6 || return $ksft_skip # Check that MTU of vti device is MTU of veth minus IPv6 header length veth_mtu="$(link_get_mtu "${ns_a}" veth_a)" @@ -1447,10 +1447,10 @@ test_pmtu_vti6_default_mtu() { } test_pmtu_vti4_link_add_mtu() { - setup namespaces || return 2 + setup namespaces || return $ksft_skip run_cmd ${ns_a} ip link add vti4_a type vti local ${veth4_a_addr} remote ${veth4_b_addr} key 10 - [ $? -ne 0 ] && err " vti not supported" && return 2 + [ $? -ne 0 ] && err " vti not supported" && return $ksft_skip run_cmd ${ns_a} ip link del vti4_a fail=0 @@ -1485,10 +1485,10 @@ test_pmtu_vti4_link_add_mtu() { } test_pmtu_vti6_link_add_mtu() { - setup namespaces || return 2 + setup namespaces || return $ksft_skip run_cmd ${ns_a} ip link add vti6_a type vti6 local ${veth6_a_addr} remote ${veth6_b_addr} key 10 - [ $? -ne 0 ] && err " vti6 not supported" && return 2 + [ $? -ne 0 ] && err " vti6 not supported" && return $ksft_skip run_cmd ${ns_a} ip link del vti6_a fail=0 @@ -1523,10 +1523,10 @@ test_pmtu_vti6_link_add_mtu() { } test_pmtu_vti6_link_change_mtu() { - setup namespaces || return 2 + setup namespaces || return $ksft_skip run_cmd ${ns_a} ip link add dummy0 mtu 1500 type dummy - [ $? -ne 0 ] && err " dummy not supported" && return 2 + [ $? -ne 0 ] && err " dummy not supported" && return $ksft_skip run_cmd ${ns_a} ip link add dummy1 mtu 3000 type dummy run_cmd ${ns_a} ip link set dummy0 up run_cmd ${ns_a} ip link set dummy1 up @@ -1579,10 +1579,10 @@ test_cleanup_vxlanX_exception() { encap="vxlan" ll_mtu=4000 - check_command taskset || return 2 + check_command taskset || return $ksft_skip cpu_list=$(grep -m 2 processor /proc/cpuinfo | cut -d ' ' -f 2) - setup namespaces routing ${encap}${outer} || return 2 + setup namespaces routing ${encap}${outer} || return $ksft_skip trace "${ns_a}" ${encap}_a "${ns_b}" ${encap}_b \ "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B @@ -1644,7 +1644,7 @@ run_test() { fi err_flush exit 1 - elif [ $ret -eq 2 ]; then + elif [ $ret -eq $ksft_skip ]; then printf "TEST: %-60s [SKIP]\n" "${tdesc}" err_flush fi @@ -1652,7 +1652,19 @@ run_test() { return $ret ) ret=$? - [ $ret -ne 0 ] && exitcode=1 + case $ret in + 0) + all_skipped=false + [ $exitcode=$ksft_skip ] && exitcode=0 + ;; + $ksft_skip) + [ $all_skipped = true ] && exitcode=$ksft_skip + ;; + *) + all_skipped=false + exitcode=1 + ;; + esac return $ret } @@ -1667,7 +1679,7 @@ run_test_nh() { } test_list_flush_ipv4_exception() { - setup namespaces routing || return 2 + setup namespaces routing || return $ksft_skip trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \ "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \ @@ -1721,7 +1733,7 @@ test_list_flush_ipv4_exception() { } test_list_flush_ipv6_exception() { - setup namespaces routing || return 2 + setup namespaces routing || return $ksft_skip trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \ "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \ @@ -1786,6 +1798,7 @@ usage() { # exitcode=0 desc=0 +all_skipped=true while getopts :ptv o do @@ -1840,7 +1853,7 @@ for t in ${tests}; do if [ $run_this -eq 1 ]; then run_test "${name}" "${desc}" # if test was skipped no need to retry with nexthop objects - [ $? -eq 2 ] && rerun_nh=0 + [ $? -eq $ksft_skip ] && rerun_nh=0 if [ "${rerun_nh}" = "1" ]; then run_test_nh "${name}" "${desc}" diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c index 2c522f7a0aec..db4521335722 100644 --- a/tools/testing/selftests/net/psock_fanout.c +++ b/tools/testing/selftests/net/psock_fanout.c @@ -56,12 +56,15 @@ #define RING_NUM_FRAMES 20 +static uint32_t cfg_max_num_members; + /* Open a socket in a given fanout mode. * @return -1 if mode is bad, a valid socket otherwise */ static int sock_fanout_open(uint16_t typeflags, uint16_t group_id) { struct sockaddr_ll addr = {0}; - int fd, val; + struct fanout_args args; + int fd, val, err; fd = socket(PF_PACKET, SOCK_RAW, 0); if (fd < 0) { @@ -83,8 +86,18 @@ static int sock_fanout_open(uint16_t typeflags, uint16_t group_id) exit(1); } - val = (((int) typeflags) << 16) | group_id; - if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val))) { + if (cfg_max_num_members) { + args.id = group_id; + args.type_flags = typeflags; + args.max_num_members = cfg_max_num_members; + err = setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &args, + sizeof(args)); + } else { + val = (((int) typeflags) << 16) | group_id; + err = setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, + sizeof(val)); + } + if (err) { if (close(fd)) { perror("close packet"); exit(1); @@ -286,6 +299,56 @@ static void test_control_group(void) } } +/* Test illegal max_num_members values */ +static void test_control_group_max_num_members(void) +{ + int fds[3]; + + fprintf(stderr, "test: control multiple sockets, max_num_members\n"); + + /* expected failure on greater than PACKET_FANOUT_MAX */ + cfg_max_num_members = (1 << 16) + 1; + if (sock_fanout_open(PACKET_FANOUT_HASH, 0) != -1) { + fprintf(stderr, "ERROR: max_num_members > PACKET_FANOUT_MAX\n"); + exit(1); + } + + cfg_max_num_members = 256; + fds[0] = sock_fanout_open(PACKET_FANOUT_HASH, 0); + if (fds[0] == -1) { + fprintf(stderr, "ERROR: failed open\n"); + exit(1); + } + + /* expected failure on joining group with different max_num_members */ + cfg_max_num_members = 257; + if (sock_fanout_open(PACKET_FANOUT_HASH, 0) != -1) { + fprintf(stderr, "ERROR: set different max_num_members\n"); + exit(1); + } + + /* success on joining group with same max_num_members */ + cfg_max_num_members = 256; + fds[1] = sock_fanout_open(PACKET_FANOUT_HASH, 0); + if (fds[1] == -1) { + fprintf(stderr, "ERROR: failed to join group\n"); + exit(1); + } + + /* success on joining group with max_num_members unspecified */ + cfg_max_num_members = 0; + fds[2] = sock_fanout_open(PACKET_FANOUT_HASH, 0); + if (fds[2] == -1) { + fprintf(stderr, "ERROR: failed to join group\n"); + exit(1); + } + + if (close(fds[2]) || close(fds[1]) || close(fds[0])) { + fprintf(stderr, "ERROR: closing sockets\n"); + exit(1); + } +} + /* Test creating a unique fanout group ids */ static void test_unique_fanout_group_ids(void) { @@ -426,8 +489,11 @@ int main(int argc, char **argv) test_control_single(); test_control_group(); + test_control_group_max_num_members(); test_unique_fanout_group_ids(); + /* PACKET_FANOUT_MAX */ + cfg_max_num_members = 1 << 16; /* find a set of ports that do not collide onto the same socket */ ret = test_datapath(PACKET_FANOUT_HASH, port_off, expect_hash[0], expect_hash[1]); diff --git a/tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh b/tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh new file mode 100755 index 000000000000..ad7a9fc59934 --- /dev/null +++ b/tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh @@ -0,0 +1,494 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# author: Andrea Mayer <andrea.mayer@uniroma2.it> + +# This test is designed for evaluating the new SRv6 End.DT4 behavior used for +# implementing IPv4 L3 VPN use cases. +# +# Hereafter a network diagram is shown, where two different tenants (named 100 +# and 200) offer IPv4 L3 VPN services allowing hosts to communicate with each +# other across an IPv6 network. +# +# Only hosts belonging to the same tenant (and to the same VPN) can communicate +# with each other. Instead, the communication among hosts of different tenants +# is forbidden. +# In other words, hosts hs-t100-1 and hs-t100-2 are connected through the IPv4 +# L3 VPN of tenant 100 while hs-t200-3 and hs-t200-4 are connected using the +# IPv4 L3 VPN of tenant 200. Cross connection between tenant 100 and tenant 200 +# is forbidden and thus, for example, hs-t100-1 cannot reach hs-t200-3 and vice +# versa. +# +# Routers rt-1 and rt-2 implement IPv4 L3 VPN services leveraging the SRv6 +# architecture. The key components for such VPNs are: a) SRv6 Encap behavior, +# b) SRv6 End.DT4 behavior and c) VRF. +# +# To explain how an IPv4 L3 VPN based on SRv6 works, let us briefly consider an +# example where, within the same domain of tenant 100, the host hs-t100-1 pings +# the host hs-t100-2. +# +# First of all, L2 reachability of the host hs-t100-2 is taken into account by +# the router rt-1 which acts as an arp proxy. +# +# When the host hs-t100-1 sends an IPv4 packet destined to hs-t100-2, the +# router rt-1 receives the packet on the internal veth-t100 interface. Such +# interface is enslaved to the VRF vrf-100 whose associated table contains the +# SRv6 Encap route for encapsulating any IPv4 packet in a IPv6 plus the Segment +# Routing Header (SRH) packet. This packet is sent through the (IPv6) core +# network up to the router rt-2 that receives it on veth0 interface. +# +# The rt-2 router uses the 'localsid' routing table to process incoming +# IPv6+SRH packets which belong to the VPN of the tenant 100. For each of these +# packets, the SRv6 End.DT4 behavior removes the outer IPv6+SRH headers and +# performs the lookup on the vrf-100 table using the destination address of +# the decapsulated IPv4 packet. Afterwards, the packet is sent to the host +# hs-t100-2 through the veth-t100 interface. +# +# The ping response follows the same processing but this time the role of rt-1 +# and rt-2 are swapped. +# +# Of course, the IPv4 L3 VPN for tenant 200 works exactly as the IPv4 L3 VPN +# for tenant 100. In this case, only hosts hs-t200-3 and hs-t200-4 are able to +# connect with each other. +# +# +# +-------------------+ +-------------------+ +# | | | | +# | hs-t100-1 netns | | hs-t100-2 netns | +# | | | | +# | +-------------+ | | +-------------+ | +# | | veth0 | | | | veth0 | | +# | | 10.0.0.1/24 | | | | 10.0.0.2/24 | | +# | +-------------+ | | +-------------+ | +# | . | | . | +# +-------------------+ +-------------------+ +# . . +# . . +# . . +# +-----------------------------------+ +-----------------------------------+ +# | . | | . | +# | +---------------+ | | +---------------- | +# | | veth-t100 | | | | veth-t100 | | +# | | 10.0.0.254/24 | +----------+ | | +----------+ | 10.0.0.254/24 | | +# | +-------+-------+ | localsid | | | | localsid | +-------+-------- | +# | | | table | | | | table | | | +# | +----+----+ +----------+ | | +----------+ +----+----+ | +# | | vrf-100 | | | | vrf-100 | | +# | +---------+ +------------+ | | +------------+ +---------+ | +# | | veth0 | | | | veth0 | | +# | | fd00::1/64 |.|...|.| fd00::2/64 | | +# | +---------+ +------------+ | | +------------+ +---------+ | +# | | vrf-200 | | | | vrf-200 | | +# | +----+----+ | | +----+----+ | +# | | | | | | +# | +-------+-------+ | | +-------+-------- | +# | | veth-t200 | | | | veth-t200 | | +# | | 10.0.0.254/24 | | | | 10.0.0.254/24 | | +# | +---------------+ rt-1 netns | | rt-2 netns +---------------- | +# | . | | . | +# +-----------------------------------+ +-----------------------------------+ +# . . +# . . +# . . +# . . +# +-------------------+ +-------------------+ +# | . | | . | +# | +-------------+ | | +-------------+ | +# | | veth0 | | | | veth0 | | +# | | 10.0.0.3/24 | | | | 10.0.0.4/24 | | +# | +-------------+ | | +-------------+ | +# | | | | +# | hs-t200-3 netns | | hs-t200-4 netns | +# | | | | +# +-------------------+ +-------------------+ +# +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# | Network configuration | +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# rt-1: localsid table (table 90) +# +-------------------------------------------------+ +# |SID |Action | +# +-------------------------------------------------+ +# |fc00:21:100::6004|apply SRv6 End.DT4 vrftable 100| +# +-------------------------------------------------+ +# |fc00:21:200::6004|apply SRv6 End.DT4 vrftable 200| +# +-------------------------------------------------+ +# +# rt-1: VRF tenant 100 (table 100) +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |10.0.0.2 |apply seg6 encap segs fc00:12:100::6004| +# +---------------------------------------------------+ +# |10.0.0.0/24|forward to dev veth_t100 | +# +---------------------------------------------------+ +# +# rt-1: VRF tenant 200 (table 200) +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |10.0.0.4 |apply seg6 encap segs fc00:12:200::6004| +# +---------------------------------------------------+ +# |10.0.0.0/24|forward to dev veth_t200 | +# +---------------------------------------------------+ +# +# +# rt-2: localsid table (table 90) +# +-------------------------------------------------+ +# |SID |Action | +# +-------------------------------------------------+ +# |fc00:12:100::6004|apply SRv6 End.DT4 vrftable 100| +# +-------------------------------------------------+ +# |fc00:12:200::6004|apply SRv6 End.DT4 vrftable 200| +# +-------------------------------------------------+ +# +# rt-2: VRF tenant 100 (table 100) +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |10.0.0.1 |apply seg6 encap segs fc00:21:100::6004| +# +---------------------------------------------------+ +# |10.0.0.0/24|forward to dev veth_t100 | +# +---------------------------------------------------+ +# +# rt-2: VRF tenant 200 (table 200) +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |10.0.0.3 |apply seg6 encap segs fc00:21:200::6004| +# +---------------------------------------------------+ +# |10.0.0.0/24|forward to dev veth_t200 | +# +---------------------------------------------------+ +# + +readonly LOCALSID_TABLE_ID=90 +readonly IPv6_RT_NETWORK=fd00 +readonly IPv4_HS_NETWORK=10.0.0 +readonly VPN_LOCATOR_SERVICE=fc00 +PING_TIMEOUT_SEC=4 + +ret=0 + +PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + nsuccess=$((nsuccess+1)) + printf "\n TEST: %-60s [ OK ]\n" "${msg}" + else + ret=1 + nfail=$((nfail+1)) + printf "\n TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +print_log_test_results() +{ + if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} + fi +} + +log_section() +{ + echo + echo "################################################################################" + echo "TEST SECTION: $*" + echo "################################################################################" +} + +cleanup() +{ + ip link del veth-rt-1 2>/dev/null || true + ip link del veth-rt-2 2>/dev/null || true + + # destroy routers rt-* and hosts hs-* + for ns in $(ip netns show | grep -E 'rt-*|hs-*'); do + ip netns del ${ns} || true + done +} + +# Setup the basic networking for the routers +setup_rt_networking() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns add ${nsname} + ip link set veth-rt-${rt} netns ${nsname} + ip -netns ${nsname} link set veth-rt-${rt} name veth0 + + ip -netns ${nsname} addr add ${IPv6_RT_NETWORK}::${rt}/64 dev veth0 + ip -netns ${nsname} link set veth0 up + ip -netns ${nsname} link set lo up + + ip netns exec ${nsname} sysctl -wq net.ipv4.ip_forward=1 + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.forwarding=1 +} + +setup_hs() +{ + local hs=$1 + local rt=$2 + local tid=$3 + local hsname=hs-t${tid}-${hs} + local rtname=rt-${rt} + local rtveth=veth-t${tid} + + # set the networking for the host + ip netns add ${hsname} + ip -netns ${hsname} link add veth0 type veth peer name ${rtveth} + ip -netns ${hsname} link set ${rtveth} netns ${rtname} + ip -netns ${hsname} addr add ${IPv4_HS_NETWORK}.${hs}/24 dev veth0 + ip -netns ${hsname} link set veth0 up + ip -netns ${hsname} link set lo up + + # configure the VRF for the tenant X on the router which is directly + # connected to the source host. + ip -netns ${rtname} link add vrf-${tid} type vrf table ${tid} + ip -netns ${rtname} link set vrf-${tid} up + + # enslave the veth-tX interface to the vrf-X in the access router + ip -netns ${rtname} link set ${rtveth} master vrf-${tid} + ip -netns ${rtname} addr add ${IPv4_HS_NETWORK}.254/24 dev ${rtveth} + ip -netns ${rtname} link set ${rtveth} up + + ip netns exec ${rtname} sysctl -wq net.ipv4.conf.${rtveth}.proxy_arp=1 + + # disable the rp_filter otherwise the kernel gets confused about how + # to route decap ipv4 packets. + ip netns exec ${rtname} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${rtname} sysctl -wq net.ipv4.conf.${rtveth}.rp_filter=0 + + ip netns exec ${rtname} sh -c "echo 1 > /proc/sys/net/vrf/strict_mode" +} + +setup_vpn_config() +{ + local hssrc=$1 + local rtsrc=$2 + local hsdst=$3 + local rtdst=$4 + local tid=$5 + + local hssrc_name=hs-t${tid}-${hssrc} + local hsdst_name=hs-t${tid}-${hsdst} + local rtsrc_name=rt-${rtsrc} + local rtdst_name=rt-${rtdst} + local vpn_sid=${VPN_LOCATOR_SERVICE}:${hssrc}${hsdst}:${tid}::6004 + + # set the encap route for encapsulating packets which arrive from the + # host hssrc and destined to the access router rtsrc. + ip -netns ${rtsrc_name} -4 route add ${IPv4_HS_NETWORK}.${hsdst}/32 vrf vrf-${tid} \ + encap seg6 mode encap segs ${vpn_sid} dev veth0 + ip -netns ${rtsrc_name} -6 route add ${vpn_sid}/128 vrf vrf-${tid} \ + via fd00::${rtdst} dev veth0 + + # set the decap route for decapsulating packets which arrive from + # the rtdst router and destined to the hsdst host. + ip -netns ${rtdst_name} -6 route add ${vpn_sid}/128 table ${LOCALSID_TABLE_ID} \ + encap seg6local action End.DT4 vrftable ${tid} dev vrf-${tid} + + # all sids for VPNs start with a common locator which is fc00::/16. + # Routes for handling the SRv6 End.DT4 behavior instances are grouped + # together in the 'localsid' table. + # + # NOTE: added only once + if [ -z "$(ip -netns ${rtdst_name} -6 rule show | \ + grep "to ${VPN_LOCATOR_SERVICE}::/16 lookup ${LOCALSID_TABLE_ID}")" ]; then + ip -netns ${rtdst_name} -6 rule add \ + to ${VPN_LOCATOR_SERVICE}::/16 \ + lookup ${LOCALSID_TABLE_ID} prio 999 + fi +} + +setup() +{ + ip link add veth-rt-1 type veth peer name veth-rt-2 + # setup the networking for router rt-1 and router rt-2 + setup_rt_networking 1 + setup_rt_networking 2 + + # setup two hosts for the tenant 100. + # - host hs-1 is directly connected to the router rt-1; + # - host hs-2 is directly connected to the router rt-2. + setup_hs 1 1 100 #args: host router tenant + setup_hs 2 2 100 + + # setup two hosts for the tenant 200 + # - host hs-3 is directly connected to the router rt-1; + # - host hs-4 is directly connected to the router rt-2. + setup_hs 3 1 200 + setup_hs 4 2 200 + + # setup the IPv4 L3 VPN which connects the host hs-t100-1 and host + # hs-t100-2 within the same tenant 100. + setup_vpn_config 1 1 2 2 100 #args: src_host src_router dst_host dst_router tenant + setup_vpn_config 2 2 1 1 100 + + # setup the IPv4 L3 VPN which connects the host hs-t200-3 and host + # hs-t200-4 within the same tenant 200. + setup_vpn_config 3 1 4 2 200 + setup_vpn_config 4 2 3 1 200 +} + +check_rt_connectivity() +{ + local rtsrc=$1 + local rtdst=$2 + + ip netns exec rt-${rtsrc} ping -c 1 -W 1 ${IPv6_RT_NETWORK}::${rtdst} \ + >/dev/null 2>&1 +} + +check_and_log_rt_connectivity() +{ + local rtsrc=$1 + local rtdst=$2 + + check_rt_connectivity ${rtsrc} ${rtdst} + log_test $? 0 "Routers connectivity: rt-${rtsrc} -> rt-${rtdst}" +} + +check_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + ip netns exec hs-t${tid}-${hssrc} ping -c 1 -W ${PING_TIMEOUT_SEC} \ + ${IPv4_HS_NETWORK}.${hsdst} >/dev/null 2>&1 +} + +check_and_log_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + check_hs_connectivity ${hssrc} ${hsdst} ${tid} + log_test $? 0 "Hosts connectivity: hs-t${tid}-${hssrc} -> hs-t${tid}-${hsdst} (tenant ${tid})" +} + +check_and_log_hs_isolation() +{ + local hssrc=$1 + local tidsrc=$2 + local hsdst=$3 + local tiddst=$4 + + check_hs_connectivity ${hssrc} ${hsdst} ${tidsrc} + # NOTE: ping should fail + log_test $? 1 "Hosts isolation: hs-t${tidsrc}-${hssrc} -X-> hs-t${tiddst}-${hsdst}" +} + + +check_and_log_hs2gw_connectivity() +{ + local hssrc=$1 + local tid=$2 + + check_hs_connectivity ${hssrc} 254 ${tid} + log_test $? 0 "Hosts connectivity: hs-t${tid}-${hssrc} -> gw (tenant ${tid})" +} + +router_tests() +{ + log_section "IPv6 routers connectivity test" + + check_and_log_rt_connectivity 1 2 + check_and_log_rt_connectivity 2 1 +} + +host2gateway_tests() +{ + log_section "IPv4 connectivity test among hosts and gateway" + + check_and_log_hs2gw_connectivity 1 100 + check_and_log_hs2gw_connectivity 2 100 + + check_and_log_hs2gw_connectivity 3 200 + check_and_log_hs2gw_connectivity 4 200 +} + +host_vpn_tests() +{ + log_section "SRv6 VPN connectivity test among hosts in the same tenant" + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 + + check_and_log_hs_connectivity 3 4 200 + check_and_log_hs_connectivity 4 3 200 +} + +host_vpn_isolation_tests() +{ + local i + local j + local k + local tmp + local l1="1 2" + local l2="3 4" + local t1=100 + local t2=200 + + log_section "SRv6 VPN isolation test among hosts in different tentants" + + for k in 0 1; do + for i in ${l1}; do + for j in ${l2}; do + check_and_log_hs_isolation ${i} ${t1} ${j} ${t2} + done + done + + # let us test the reverse path + tmp="${l1}"; l1="${l2}"; l2="${tmp}" + tmp=${t1}; t1=${t2}; t2=${tmp} + done +} + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit 0 +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit 0 +fi + +modprobe vrf &>/dev/null +if [ ! -e /proc/sys/net/vrf/strict_mode ]; then + echo "SKIP: vrf sysctl does not exist" + exit 0 +fi + +cleanup &>/dev/null + +setup + +router_tests +host2gateway_tests +host_vpn_tests +host_vpn_isolation_tests + +print_log_test_results + +cleanup &>/dev/null + +exit ${ret} diff --git a/tools/testing/selftests/net/srv6_end_dt6_l3vpn_test.sh b/tools/testing/selftests/net/srv6_end_dt6_l3vpn_test.sh new file mode 100755 index 000000000000..68708f5e26a0 --- /dev/null +++ b/tools/testing/selftests/net/srv6_end_dt6_l3vpn_test.sh @@ -0,0 +1,502 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# author: Andrea Mayer <andrea.mayer@uniroma2.it> +# author: Paolo Lungaroni <paolo.lungaroni@cnit.it> + +# This test is designed for evaluating the new SRv6 End.DT6 behavior used for +# implementing IPv6 L3 VPN use cases. +# +# Hereafter a network diagram is shown, where two different tenants (named 100 +# and 200) offer IPv6 L3 VPN services allowing hosts to communicate with each +# other across an IPv6 network. +# +# Only hosts belonging to the same tenant (and to the same VPN) can communicate +# with each other. Instead, the communication among hosts of different tenants +# is forbidden. +# In other words, hosts hs-t100-1 and hs-t100-2 are connected through the IPv6 +# L3 VPN of tenant 100 while hs-t200-3 and hs-t200-4 are connected using the +# IPv6 L3 VPN of tenant 200. Cross connection between tenant 100 and tenant 200 +# is forbidden and thus, for example, hs-t100-1 cannot reach hs-t200-3 and vice +# versa. +# +# Routers rt-1 and rt-2 implement IPv6 L3 VPN services leveraging the SRv6 +# architecture. The key components for such VPNs are: a) SRv6 Encap behavior, +# b) SRv6 End.DT6 behavior and c) VRF. +# +# To explain how an IPv6 L3 VPN based on SRv6 works, let us briefly consider an +# example where, within the same domain of tenant 100, the host hs-t100-1 pings +# the host hs-t100-2. +# +# First of all, L2 reachability of the host hs-t100-2 is taken into account by +# the router rt-1 which acts as a ndp proxy. +# +# When the host hs-t100-1 sends an IPv6 packet destined to hs-t100-2, the +# router rt-1 receives the packet on the internal veth-t100 interface. Such +# interface is enslaved to the VRF vrf-100 whose associated table contains the +# SRv6 Encap route for encapsulating any IPv6 packet in a IPv6 plus the Segment +# Routing Header (SRH) packet. This packet is sent through the (IPv6) core +# network up to the router rt-2 that receives it on veth0 interface. +# +# The rt-2 router uses the 'localsid' routing table to process incoming +# IPv6+SRH packets which belong to the VPN of the tenant 100. For each of these +# packets, the SRv6 End.DT6 behavior removes the outer IPv6+SRH headers and +# performs the lookup on the vrf-100 table using the destination address of +# the decapsulated IPv6 packet. Afterwards, the packet is sent to the host +# hs-t100-2 through the veth-t100 interface. +# +# The ping response follows the same processing but this time the role of rt-1 +# and rt-2 are swapped. +# +# Of course, the IPv6 L3 VPN for tenant 200 works exactly as the IPv6 L3 VPN +# for tenant 100. In this case, only hosts hs-t200-3 and hs-t200-4 are able to +# connect with each other. +# +# +# +-------------------+ +-------------------+ +# | | | | +# | hs-t100-1 netns | | hs-t100-2 netns | +# | | | | +# | +-------------+ | | +-------------+ | +# | | veth0 | | | | veth0 | | +# | | cafe::1/64 | | | | cafe::2/64 | | +# | +-------------+ | | +-------------+ | +# | . | | . | +# +-------------------+ +-------------------+ +# . . +# . . +# . . +# +-----------------------------------+ +-----------------------------------+ +# | . | | . | +# | +---------------+ | | +---------------- | +# | | veth-t100 | | | | veth-t100 | | +# | | cafe::254/64 | +----------+ | | +----------+ | cafe::254/64 | | +# | +-------+-------+ | localsid | | | | localsid | +-------+-------- | +# | | | table | | | | table | | | +# | +----+----+ +----------+ | | +----------+ +----+----+ | +# | | vrf-100 | | | | vrf-100 | | +# | +---------+ +------------+ | | +------------+ +---------+ | +# | | veth0 | | | | veth0 | | +# | | fd00::1/64 |.|...|.| fd00::2/64 | | +# | +---------+ +------------+ | | +------------+ +---------+ | +# | | vrf-200 | | | | vrf-200 | | +# | +----+----+ | | +----+----+ | +# | | | | | | +# | +-------+-------+ | | +-------+-------- | +# | | veth-t200 | | | | veth-t200 | | +# | | cafe::254/64 | | | | cafe::254/64 | | +# | +---------------+ rt-1 netns | | rt-2 netns +---------------- | +# | . | | . | +# +-----------------------------------+ +-----------------------------------+ +# . . +# . . +# . . +# . . +# +-------------------+ +-------------------+ +# | . | | . | +# | +-------------+ | | +-------------+ | +# | | veth0 | | | | veth0 | | +# | | cafe::3/64 | | | | cafe::4/64 | | +# | +-------------+ | | +-------------+ | +# | | | | +# | hs-t200-3 netns | | hs-t200-4 netns | +# | | | | +# +-------------------+ +-------------------+ +# +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# | Network configuration | +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# rt-1: localsid table (table 90) +# +-------------------------------------------------+ +# |SID |Action | +# +-------------------------------------------------+ +# |fc00:21:100::6006|apply SRv6 End.DT6 vrftable 100| +# +-------------------------------------------------+ +# |fc00:21:200::6006|apply SRv6 End.DT6 vrftable 200| +# +-------------------------------------------------+ +# +# rt-1: VRF tenant 100 (table 100) +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::2 |apply seg6 encap segs fc00:12:100::6006| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev veth_t100 | +# +---------------------------------------------------+ +# +# rt-1: VRF tenant 200 (table 200) +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::4 |apply seg6 encap segs fc00:12:200::6006| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev veth_t200 | +# +---------------------------------------------------+ +# +# +# rt-2: localsid table (table 90) +# +-------------------------------------------------+ +# |SID |Action | +# +-------------------------------------------------+ +# |fc00:12:100::6006|apply SRv6 End.DT6 vrftable 100| +# +-------------------------------------------------+ +# |fc00:12:200::6006|apply SRv6 End.DT6 vrftable 200| +# +-------------------------------------------------+ +# +# rt-2: VRF tenant 100 (table 100) +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::1 |apply seg6 encap segs fc00:21:100::6006| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev veth_t100 | +# +---------------------------------------------------+ +# +# rt-2: VRF tenant 200 (table 200) +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::3 |apply seg6 encap segs fc00:21:200::6006| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev veth_t200 | +# +---------------------------------------------------+ +# + +readonly LOCALSID_TABLE_ID=90 +readonly IPv6_RT_NETWORK=fd00 +readonly IPv6_HS_NETWORK=cafe +readonly VPN_LOCATOR_SERVICE=fc00 +PING_TIMEOUT_SEC=4 + +ret=0 + +PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + nsuccess=$((nsuccess+1)) + printf "\n TEST: %-60s [ OK ]\n" "${msg}" + else + ret=1 + nfail=$((nfail+1)) + printf "\n TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +print_log_test_results() +{ + if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} + fi +} + +log_section() +{ + echo + echo "################################################################################" + echo "TEST SECTION: $*" + echo "################################################################################" +} + +cleanup() +{ + ip link del veth-rt-1 2>/dev/null || true + ip link del veth-rt-2 2>/dev/null || true + + # destroy routers rt-* and hosts hs-* + for ns in $(ip netns show | grep -E 'rt-*|hs-*'); do + ip netns del ${ns} || true + done +} + +# Setup the basic networking for the routers +setup_rt_networking() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns add ${nsname} + ip link set veth-rt-${rt} netns ${nsname} + ip -netns ${nsname} link set veth-rt-${rt} name veth0 + + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + ip -netns ${nsname} addr add ${IPv6_RT_NETWORK}::${rt}/64 dev veth0 nodad + ip -netns ${nsname} link set veth0 up + ip -netns ${nsname} link set lo up + + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.forwarding=1 +} + +setup_hs() +{ + local hs=$1 + local rt=$2 + local tid=$3 + local hsname=hs-t${tid}-${hs} + local rtname=rt-${rt} + local rtveth=veth-t${tid} + + # set the networking for the host + ip netns add ${hsname} + + ip netns exec ${hsname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${hsname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + ip -netns ${hsname} link add veth0 type veth peer name ${rtveth} + ip -netns ${hsname} link set ${rtveth} netns ${rtname} + ip -netns ${hsname} addr add ${IPv6_HS_NETWORK}::${hs}/64 dev veth0 nodad + ip -netns ${hsname} link set veth0 up + ip -netns ${hsname} link set lo up + + # configure the VRF for the tenant X on the router which is directly + # connected to the source host. + ip -netns ${rtname} link add vrf-${tid} type vrf table ${tid} + ip -netns ${rtname} link set vrf-${tid} up + + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + # enslave the veth-tX interface to the vrf-X in the access router + ip -netns ${rtname} link set ${rtveth} master vrf-${tid} + ip -netns ${rtname} addr add ${IPv6_HS_NETWORK}::254/64 dev ${rtveth} nodad + ip -netns ${rtname} link set ${rtveth} up + + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.${rtveth}.proxy_ndp=1 + + ip netns exec ${rtname} sh -c "echo 1 > /proc/sys/net/vrf/strict_mode" +} + +setup_vpn_config() +{ + local hssrc=$1 + local rtsrc=$2 + local hsdst=$3 + local rtdst=$4 + local tid=$5 + + local hssrc_name=hs-t${tid}-${hssrc} + local hsdst_name=hs-t${tid}-${hsdst} + local rtsrc_name=rt-${rtsrc} + local rtdst_name=rt-${rtdst} + local rtveth=veth-t${tid} + local vpn_sid=${VPN_LOCATOR_SERVICE}:${hssrc}${hsdst}:${tid}::6006 + + ip -netns ${rtsrc_name} -6 neigh add proxy ${IPv6_HS_NETWORK}::${hsdst} dev ${rtveth} + + # set the encap route for encapsulating packets which arrive from the + # host hssrc and destined to the access router rtsrc. + ip -netns ${rtsrc_name} -6 route add ${IPv6_HS_NETWORK}::${hsdst}/128 vrf vrf-${tid} \ + encap seg6 mode encap segs ${vpn_sid} dev veth0 + ip -netns ${rtsrc_name} -6 route add ${vpn_sid}/128 vrf vrf-${tid} \ + via fd00::${rtdst} dev veth0 + + # set the decap route for decapsulating packets which arrive from + # the rtdst router and destined to the hsdst host. + ip -netns ${rtdst_name} -6 route add ${vpn_sid}/128 table ${LOCALSID_TABLE_ID} \ + encap seg6local action End.DT6 vrftable ${tid} dev vrf-${tid} + + # all sids for VPNs start with a common locator which is fc00::/16. + # Routes for handling the SRv6 End.DT6 behavior instances are grouped + # together in the 'localsid' table. + # + # NOTE: added only once + if [ -z "$(ip -netns ${rtdst_name} -6 rule show | \ + grep "to ${VPN_LOCATOR_SERVICE}::/16 lookup ${LOCALSID_TABLE_ID}")" ]; then + ip -netns ${rtdst_name} -6 rule add \ + to ${VPN_LOCATOR_SERVICE}::/16 \ + lookup ${LOCALSID_TABLE_ID} prio 999 + fi +} + +setup() +{ + ip link add veth-rt-1 type veth peer name veth-rt-2 + # setup the networking for router rt-1 and router rt-2 + setup_rt_networking 1 + setup_rt_networking 2 + + # setup two hosts for the tenant 100. + # - host hs-1 is directly connected to the router rt-1; + # - host hs-2 is directly connected to the router rt-2. + setup_hs 1 1 100 #args: host router tenant + setup_hs 2 2 100 + + # setup two hosts for the tenant 200 + # - host hs-3 is directly connected to the router rt-1; + # - host hs-4 is directly connected to the router rt-2. + setup_hs 3 1 200 + setup_hs 4 2 200 + + # setup the IPv6 L3 VPN which connects the host hs-t100-1 and host + # hs-t100-2 within the same tenant 100. + setup_vpn_config 1 1 2 2 100 #args: src_host src_router dst_host dst_router tenant + setup_vpn_config 2 2 1 1 100 + + # setup the IPv6 L3 VPN which connects the host hs-t200-3 and host + # hs-t200-4 within the same tenant 200. + setup_vpn_config 3 1 4 2 200 + setup_vpn_config 4 2 3 1 200 +} + +check_rt_connectivity() +{ + local rtsrc=$1 + local rtdst=$2 + + ip netns exec rt-${rtsrc} ping -c 1 -W 1 ${IPv6_RT_NETWORK}::${rtdst} \ + >/dev/null 2>&1 +} + +check_and_log_rt_connectivity() +{ + local rtsrc=$1 + local rtdst=$2 + + check_rt_connectivity ${rtsrc} ${rtdst} + log_test $? 0 "Routers connectivity: rt-${rtsrc} -> rt-${rtdst}" +} + +check_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + ip netns exec hs-t${tid}-${hssrc} ping -c 1 -W ${PING_TIMEOUT_SEC} \ + ${IPv6_HS_NETWORK}::${hsdst} >/dev/null 2>&1 +} + +check_and_log_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + check_hs_connectivity ${hssrc} ${hsdst} ${tid} + log_test $? 0 "Hosts connectivity: hs-t${tid}-${hssrc} -> hs-t${tid}-${hsdst} (tenant ${tid})" +} + +check_and_log_hs_isolation() +{ + local hssrc=$1 + local tidsrc=$2 + local hsdst=$3 + local tiddst=$4 + + check_hs_connectivity ${hssrc} ${hsdst} ${tidsrc} + # NOTE: ping should fail + log_test $? 1 "Hosts isolation: hs-t${tidsrc}-${hssrc} -X-> hs-t${tiddst}-${hsdst}" +} + + +check_and_log_hs2gw_connectivity() +{ + local hssrc=$1 + local tid=$2 + + check_hs_connectivity ${hssrc} 254 ${tid} + log_test $? 0 "Hosts connectivity: hs-t${tid}-${hssrc} -> gw (tenant ${tid})" +} + +router_tests() +{ + log_section "IPv6 routers connectivity test" + + check_and_log_rt_connectivity 1 2 + check_and_log_rt_connectivity 2 1 +} + +host2gateway_tests() +{ + log_section "IPv6 connectivity test among hosts and gateway" + + check_and_log_hs2gw_connectivity 1 100 + check_and_log_hs2gw_connectivity 2 100 + + check_and_log_hs2gw_connectivity 3 200 + check_and_log_hs2gw_connectivity 4 200 +} + +host_vpn_tests() +{ + log_section "SRv6 VPN connectivity test among hosts in the same tenant" + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 + + check_and_log_hs_connectivity 3 4 200 + check_and_log_hs_connectivity 4 3 200 +} + +host_vpn_isolation_tests() +{ + local i + local j + local k + local tmp + local l1="1 2" + local l2="3 4" + local t1=100 + local t2=200 + + log_section "SRv6 VPN isolation test among hosts in different tentants" + + for k in 0 1; do + for i in ${l1}; do + for j in ${l2}; do + check_and_log_hs_isolation ${i} ${t1} ${j} ${t2} + done + done + + # let us test the reverse path + tmp="${l1}"; l1="${l2}"; l2="${tmp}" + tmp=${t1}; t1=${t2}; t2=${tmp} + done +} + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit 0 +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit 0 +fi + +modprobe vrf &>/dev/null +if [ ! -e /proc/sys/net/vrf/strict_mode ]; then + echo "SKIP: vrf sysctl does not exist" + exit 0 +fi + +cleanup &>/dev/null + +setup + +router_tests +host2gateway_tests +host_vpn_tests +host_vpn_isolation_tests + +print_log_test_results + +cleanup &>/dev/null + +exit ${ret} diff --git a/tools/testing/selftests/net/test_vxlan_under_vrf.sh b/tools/testing/selftests/net/test_vxlan_under_vrf.sh index 09f9ed92cbe4..534c8b7699ab 100755 --- a/tools/testing/selftests/net/test_vxlan_under_vrf.sh +++ b/tools/testing/selftests/net/test_vxlan_under_vrf.sh @@ -50,7 +50,7 @@ cleanup() { ip link del veth-tap 2>/dev/null || true for ns in hv-1 hv-2 vm-1 vm-2; do - ip netns del $ns || true + ip netns del $ns 2>/dev/null || true done } diff --git a/tools/testing/selftests/net/timestamping.c b/tools/testing/selftests/net/timestamping.c index f4bb4fef0f39..21091be70688 100644 --- a/tools/testing/selftests/net/timestamping.c +++ b/tools/testing/selftests/net/timestamping.c @@ -59,7 +59,8 @@ static void usage(const char *error) " SOF_TIMESTAMPING_SOFTWARE - request reporting of software time stamps\n" " SOF_TIMESTAMPING_RAW_HARDWARE - request reporting of raw HW time stamps\n" " SIOCGSTAMP - check last socket time stamp\n" - " SIOCGSTAMPNS - more accurate socket time stamp\n"); + " SIOCGSTAMPNS - more accurate socket time stamp\n" + " PTPV2 - use PTPv2 messages\n"); exit(1); } @@ -115,13 +116,28 @@ static const unsigned char sync[] = { 0x00, 0x00, 0x00, 0x00 }; -static void sendpacket(int sock, struct sockaddr *addr, socklen_t addr_len) +static const unsigned char sync_v2[] = { + 0x00, 0x02, 0x00, 0x2C, + 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0xFF, + 0xFE, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static void sendpacket(int sock, struct sockaddr *addr, socklen_t addr_len, int ptpv2) { + size_t sync_len = ptpv2 ? sizeof(sync_v2) : sizeof(sync); + const void *sync_p = ptpv2 ? sync_v2 : sync; struct timeval now; int res; - res = sendto(sock, sync, sizeof(sync), 0, - addr, addr_len); + res = sendto(sock, sync_p, sync_len, 0, addr, addr_len); gettimeofday(&now, 0); if (res < 0) printf("%s: %s\n", "send", strerror(errno)); @@ -134,9 +150,11 @@ static void sendpacket(int sock, struct sockaddr *addr, socklen_t addr_len) static void printpacket(struct msghdr *msg, int res, char *data, int sock, int recvmsg_flags, - int siocgstamp, int siocgstampns) + int siocgstamp, int siocgstampns, int ptpv2) { struct sockaddr_in *from_addr = (struct sockaddr_in *)msg->msg_name; + size_t sync_len = ptpv2 ? sizeof(sync_v2) : sizeof(sync); + const void *sync_p = ptpv2 ? sync_v2 : sync; struct cmsghdr *cmsg; struct timeval tv; struct timespec ts; @@ -210,10 +228,9 @@ static void printpacket(struct msghdr *msg, int res, "probably SO_EE_ORIGIN_TIMESTAMPING" #endif ); - if (res < sizeof(sync)) + if (res < sync_len) printf(" => truncated data?!"); - else if (!memcmp(sync, data + res - sizeof(sync), - sizeof(sync))) + else if (!memcmp(sync_p, data + res - sync_len, sync_len)) printf(" => GOT OUR DATA BACK (HURRAY!)"); break; } @@ -257,7 +274,7 @@ static void printpacket(struct msghdr *msg, int res, } static void recvpacket(int sock, int recvmsg_flags, - int siocgstamp, int siocgstampns) + int siocgstamp, int siocgstampns, int ptpv2) { char data[256]; struct msghdr msg; @@ -288,7 +305,7 @@ static void recvpacket(int sock, int recvmsg_flags, } else { printpacket(&msg, res, data, sock, recvmsg_flags, - siocgstamp, siocgstampns); + siocgstamp, siocgstampns, ptpv2); } } @@ -300,6 +317,7 @@ int main(int argc, char **argv) int siocgstamp = 0; int siocgstampns = 0; int ip_multicast_loop = 0; + int ptpv2 = 0; char *interface; int i; int enabled = 1; @@ -335,6 +353,8 @@ int main(int argc, char **argv) siocgstampns = 1; else if (!strcasecmp(argv[i], "IP_MULTICAST_LOOP")) ip_multicast_loop = 1; + else if (!strcasecmp(argv[i], "PTPV2")) + ptpv2 = 1; else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_HARDWARE")) so_timestamping_flags |= SOF_TIMESTAMPING_TX_HARDWARE; else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_SOFTWARE")) @@ -369,6 +389,7 @@ int main(int argc, char **argv) HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF; hwconfig.rx_filter = (so_timestamping_flags & SOF_TIMESTAMPING_RX_HARDWARE) ? + ptpv2 ? HWTSTAMP_FILTER_PTP_V2_L4_SYNC : HWTSTAMP_FILTER_PTP_V1_L4_SYNC : HWTSTAMP_FILTER_NONE; hwconfig_requested = hwconfig; if (ioctl(sock, SIOCSHWTSTAMP, &hwtstamp) < 0) { @@ -496,16 +517,16 @@ int main(int argc, char **argv) printf("has error\n"); recvpacket(sock, 0, siocgstamp, - siocgstampns); + siocgstampns, ptpv2); recvpacket(sock, MSG_ERRQUEUE, siocgstamp, - siocgstampns); + siocgstampns, ptpv2); } } else { /* write one packet */ sendpacket(sock, (struct sockaddr *)&addr, - sizeof(addr)); + sizeof(addr), ptpv2); next.tv_sec += 5; continue; } diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index b599f1fa99b5..cb0d1890a860 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -103,32 +103,58 @@ FIXTURE(tls) FIXTURE_VARIANT(tls) { - unsigned int tls_version; + u16 tls_version; + u16 cipher_type; }; -FIXTURE_VARIANT_ADD(tls, 12) +FIXTURE_VARIANT_ADD(tls, 12_gcm) { .tls_version = TLS_1_2_VERSION, + .cipher_type = TLS_CIPHER_AES_GCM_128, }; -FIXTURE_VARIANT_ADD(tls, 13) +FIXTURE_VARIANT_ADD(tls, 13_gcm) { .tls_version = TLS_1_3_VERSION, + .cipher_type = TLS_CIPHER_AES_GCM_128, +}; + +FIXTURE_VARIANT_ADD(tls, 12_chacha) +{ + .tls_version = TLS_1_2_VERSION, + .cipher_type = TLS_CIPHER_CHACHA20_POLY1305, +}; + +FIXTURE_VARIANT_ADD(tls, 13_chacha) +{ + .tls_version = TLS_1_3_VERSION, + .cipher_type = TLS_CIPHER_CHACHA20_POLY1305, }; FIXTURE_SETUP(tls) { - struct tls12_crypto_info_aes_gcm_128 tls12; + union tls_crypto_context tls12; struct sockaddr_in addr; socklen_t len; int sfd, ret; + size_t tls12_sz; self->notls = false; len = sizeof(addr); memset(&tls12, 0, sizeof(tls12)); tls12.info.version = variant->tls_version; - tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; + tls12.info.cipher_type = variant->cipher_type; + switch (variant->cipher_type) { + case TLS_CIPHER_CHACHA20_POLY1305: + tls12_sz = sizeof(tls12_crypto_info_chacha20_poly1305); + break; + case TLS_CIPHER_AES_GCM_128: + tls12_sz = sizeof(tls12_crypto_info_aes_gcm_128); + break; + default: + tls12_sz = 0; + } addr.sin_family = AF_INET; addr.sin_addr.s_addr = htonl(INADDR_ANY); @@ -156,7 +182,7 @@ FIXTURE_SETUP(tls) if (!self->notls) { ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, - sizeof(tls12)); + tls12_sz); ASSERT_EQ(ret, 0); } @@ -169,7 +195,7 @@ FIXTURE_SETUP(tls) ASSERT_EQ(ret, 0); ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, - sizeof(tls12)); + tls12_sz); ASSERT_EQ(ret, 0); } diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c index db3d4a8b5a4c..76a24052f4b4 100644 --- a/tools/testing/selftests/net/udpgso_bench_rx.c +++ b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -113,6 +113,9 @@ static void do_poll(int fd, int timeout_ms) interrupted = true; break; } + + /* no events and more time to wait, do poll again */ + continue; } if (pfd.revents != POLLIN) error(1, errno, "poll: 0x%x expected 0x%x\n", diff --git a/tools/testing/selftests/openat2/openat2_test.c b/tools/testing/selftests/openat2/openat2_test.c index b386367c606b..381d874cce99 100644 --- a/tools/testing/selftests/openat2/openat2_test.c +++ b/tools/testing/selftests/openat2/openat2_test.c @@ -155,7 +155,7 @@ struct flag_test { int err; }; -#define NUM_OPENAT2_FLAG_TESTS 23 +#define NUM_OPENAT2_FLAG_TESTS 24 void test_openat2_flags(void) { @@ -210,6 +210,12 @@ void test_openat2_flags(void) .how.flags = O_TMPFILE | O_RDWR, .how.mode = 0x0000A00000000000ULL, .err = -EINVAL }, + /* ->resolve flags must not conflict. */ + { .name = "incompatible resolve flags (BENEATH | IN_ROOT)", + .how.flags = O_RDONLY, + .how.resolve = RESOLVE_BENEATH | RESOLVE_IN_ROOT, + .err = -EINVAL }, + /* ->resolve must only contain RESOLVE_* flags. */ { .name = "invalid how.resolve and O_RDONLY", .how.flags = O_RDONLY, diff --git a/tools/testing/selftests/rcutorture/bin/console-badness.sh b/tools/testing/selftests/rcutorture/bin/console-badness.sh index 0e4c0b2eb7f0..80ae7f08b363 100755 --- a/tools/testing/selftests/rcutorture/bin/console-badness.sh +++ b/tools/testing/selftests/rcutorture/bin/console-badness.sh @@ -13,4 +13,5 @@ egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for|!!!' | grep -v 'ODEBUG: ' | grep -v 'This means that this is a DEBUG kernel and it is' | -grep -v 'Warning: unable to open an initial console' +grep -v 'Warning: unable to open an initial console' | +grep -v 'NOHZ tick-stop error: Non-RCU local softirq work is pending, handler' diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index 51f3464b96d3..82663495fb38 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -169,6 +169,7 @@ identify_qemu () { # Output arguments for the qemu "-append" string based on CPU type # and the TORTURE_QEMU_INTERACTIVE environment variable. identify_qemu_append () { + echo debug_boot_weak_hash local console=ttyS0 case "$1" in qemu-system-x86_64|qemu-system-i386) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh index 6e65c134e5f1..370406bbfeed 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh @@ -52,8 +52,7 @@ echo Results directory: $resdir/$ds KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM PATH=${KVM}/bin:$PATH; export PATH . functions.sh -cpus="`identify_qemu_vcpus`" -echo Using up to $cpus CPUs. +echo Using all `identify_qemu_vcpus` CPUs. # Each pass through this loop does one command-line argument. for gitbr in $@ @@ -74,7 +73,7 @@ do # Test the specified commit. git checkout $i > $resdir/$ds/$idir/git-checkout.out 2>&1 echo git checkout return code: $? "(Commit $ntry: $i)" - kvm.sh --cpus $cpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1 + kvm.sh --allcpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1 ret=$? echo kvm.sh return code $ret for commit $i from branch $gitbr diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh index aa745152a525..b582113178ac 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh @@ -32,7 +32,7 @@ sed -e 's/^\[[^]]*]//' < $i/console.log | awk ' /-scale: .* gps: .* batches:/ { ngps = $9; - nbatches = $11; + nbatches = 1; } /-scale: .*writer-duration/ { diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 6dc2b49b85ea..3cd03d01857c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -206,7 +206,10 @@ do kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 then - if test $kruntime -ge $seconds -o -f "$TORTURE_STOPFILE" + if test -n "$TORTURE_KCONFIG_GDB_ARG" + then + : + elif test $kruntime -ge $seconds || test -f "$TORTURE_STOPFILE" then break; fi @@ -223,6 +226,20 @@ do echo "ps -fp $killpid" >> $resdir/Warnings 2>&1 ps -fp $killpid >> $resdir/Warnings 2>&1 fi + # Reduce probability of PID reuse by allowing a one-minute buffer + if test $((kruntime + 60)) -lt $seconds && test -s "$resdir/../jitter_pids" + then + awk < "$resdir/../jitter_pids" ' + NF > 0 { + pidlist = pidlist " " $1; + n++; + } + END { + if (n > 0) { + print "kill " pidlist; + } + }' | sh + fi else echo ' ---' `date`: "Kernel done" fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 6eb1d3f6524d..45d07b7b69f5 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -58,7 +58,7 @@ usage () { echo " --datestamp string" echo " --defconfig string" echo " --dryrun sched|script" - echo " --duration minutes" + echo " --duration minutes | <seconds>s | <hours>h | <days>d" echo " --gdb" echo " --help" echo " --interactive" @@ -93,7 +93,7 @@ do TORTURE_BOOT_IMAGE="$2" shift ;; - --buildonly) + --buildonly|--build-only) TORTURE_BUILDONLY=1 ;; --configs|--config) @@ -128,8 +128,20 @@ do shift ;; --duration) - checkarg --duration "(minutes)" $# "$2" '^[0-9]*$' '^error' - dur=$(($2*60)) + checkarg --duration "(minutes)" $# "$2" '^[0-9][0-9]*\(s\|m\|h\|d\|\)$' '^error' + mult=60 + if echo "$2" | grep -q 's$' + then + mult=1 + elif echo "$2" | grep -q 'h$' + then + mult=3600 + elif echo "$2" | grep -q 'd$' + then + mult=86400 + fi + ts=`echo $2 | sed -e 's/[smhd]$//'` + dur=$(($ts*mult)) shift ;; --gdb) @@ -148,7 +160,7 @@ do jitter="$2" shift ;; - --kconfig) + --kconfig|--kconfigs) checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\)*$' '^error$' TORTURE_KCONFIG_ARG="$2" shift @@ -159,7 +171,7 @@ do --kcsan) TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_KCSAN_INTERRUPT_WATCHER=y"; export TORTURE_KCONFIG_KCSAN_ARG ;; - --kmake-arg) + --kmake-arg|--kmake-args) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' TORTURE_KMAKE_ARG="$2" shift @@ -459,8 +471,11 @@ function dump(first, pastlast, batchnum) print "if test -n \"$needqemurun\"" print "then" print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; - for (j = 0; j < njitter; j++) + print "\techo > " rd "jitter_pids" + for (j = 0; j < njitter; j++) { print "\tjitter.sh " j " " dur " " ja[2] " " ja[3] "&" + print "\techo $! >> " rd "jitter_pids" + } print "\twait" print "\techo ---- All kernel runs complete. `date` | tee -a " rd "log"; print "else" diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index e03338091a06..263b1be50008 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -133,7 +133,7 @@ then then summary="$summary Warnings: $n_warn" fi - n_bugs=`egrep -c 'BUG|Oops:' $file` + n_bugs=`egrep -c '\bBUG|Oops:' $file` if test "$n_bugs" -ne 0 then summary="$summary Bugs: $n_bugs" diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t index 6c78022c8cd8..d6557c38dfe4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t @@ -4,7 +4,8 @@ CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n #CHECK#CONFIG_TINY_SRCU=y CONFIG_RCU_TRACE=n -CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_DEBUG_ATOMIC_SLEEP=y #CHECK#CONFIG_PREEMPT_COUNT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u index c15ada821e45..6bc24e99862f 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u @@ -4,7 +4,6 @@ CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n #CHECK#CONFIG_TINY_SRCU=y CONFIG_RCU_TRACE=n -CONFIG_DEBUG_LOCK_ALLOC=y -CONFIG_PROVE_LOCKING=y +CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_PREEMPT_COUNT=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 index 12e7661b86f5..34c8ff5a12f2 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 @@ -4,8 +4,8 @@ CONFIG_HOTPLUG_CPU=y CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n -CONFIG_DEBUG_LOCK_ALLOC=y -CONFIG_PROVE_LOCKING=y -#CHECK#CONFIG_PROVE_RCU=y +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +#CHECK#CONFIG_PROVE_RCU=n CONFIG_TASKS_TRACE_RCU_READ_MB=y CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 index b69ed6673c41..77541eeb4e9f 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 @@ -4,8 +4,8 @@ CONFIG_HOTPLUG_CPU=y CONFIG_PREEMPT_NONE=n CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=y -CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_PROVE_LOCKING=n -#CHECK#CONFIG_PROVE_RCU=n +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y +#CHECK#CONFIG_PROVE_RCU=y CONFIG_TASKS_TRACE_RCU_READ_MB=n CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon index 87caa0e932c7..90942bb5bebc 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon @@ -1,2 +1,5 @@ CONFIG_RCU_SCALE_TEST=y CONFIG_PRINTK_TIME=y +CONFIG_TASKS_RCU_GENERIC=y +CONFIG_TASKS_RCU=y +CONFIG_TASKS_TRACE_RCU=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01 new file mode 100644 index 000000000000..e6baa2fbaeb3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01 @@ -0,0 +1,15 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01.boot b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01.boot new file mode 100644 index 000000000000..af0aff1457a4 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01.boot @@ -0,0 +1 @@ +rcuscale.scale_type=tasks-tracing diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c index 384589095864..699ad5f93c34 100644 --- a/tools/testing/selftests/rseq/param_test.c +++ b/tools/testing/selftests/rseq/param_test.c @@ -1133,6 +1133,8 @@ static int set_signal_handler(void) return ret; } +/* Test MEMBARRIER_CMD_PRIVATE_RESTART_RSEQ_ON_CPU membarrier command. */ +#ifdef RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV struct test_membarrier_thread_args { int stop; intptr_t percpu_list_ptr; @@ -1286,8 +1288,6 @@ void *test_membarrier_manager_thread(void *arg) return NULL; } -/* Test MEMBARRIER_CMD_PRIVATE_RESTART_RSEQ_ON_CPU membarrier command. */ -#ifdef RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV void test_membarrier(void) { const int num_threads = opt_threads; diff --git a/tools/testing/selftests/run_kselftest.sh b/tools/testing/selftests/run_kselftest.sh index 609a4ef9300e..97165a83df63 100755 --- a/tools/testing/selftests/run_kselftest.sh +++ b/tools/testing/selftests/run_kselftest.sh @@ -48,7 +48,7 @@ while true; do -l | --list) echo "$available" exit 0 ;; - -n | --dry-run) + -d | --dry-run) dryrun="echo" shift ;; -h | --help) diff --git a/tools/testing/selftests/seccomp/config b/tools/testing/selftests/seccomp/config index 64c19d8eba79..ad431a5178fb 100644 --- a/tools/testing/selftests/seccomp/config +++ b/tools/testing/selftests/seccomp/config @@ -1,3 +1,4 @@ +CONFIG_PID_NS=y CONFIG_SECCOMP=y CONFIG_SECCOMP_FILTER=y CONFIG_USER_NS=y diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index 91f5a89cadac..fcc806585266 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -4,12 +4,16 @@ */ #define _GNU_SOURCE #include <assert.h> +#include <limits.h> +#include <stdbool.h> +#include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <time.h> #include <unistd.h> #include <linux/filter.h> #include <linux/seccomp.h> +#include <sys/param.h> #include <sys/prctl.h> #include <sys/syscall.h> #include <sys/types.h> @@ -70,18 +74,74 @@ unsigned long long calibrate(void) return samples * seconds; } +bool approx(int i_one, int i_two) +{ + double one = i_one, one_bump = one * 0.01; + double two = i_two, two_bump = two * 0.01; + + one_bump = one + MAX(one_bump, 2.0); + two_bump = two + MAX(two_bump, 2.0); + + /* Equal to, or within 1% or 2 digits */ + if (one == two || + (one > two && one <= two_bump) || + (two > one && two <= one_bump)) + return true; + return false; +} + +bool le(int i_one, int i_two) +{ + if (i_one <= i_two) + return true; + return false; +} + +long compare(const char *name_one, const char *name_eval, const char *name_two, + unsigned long long one, bool (*eval)(int, int), unsigned long long two) +{ + bool good; + + printf("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two, + (long long)one, name_eval, (long long)two); + if (one > INT_MAX) { + printf("Miscalculation! Measurement went negative: %lld\n", (long long)one); + return 1; + } + if (two > INT_MAX) { + printf("Miscalculation! Measurement went negative: %lld\n", (long long)two); + return 1; + } + + good = eval(one, two); + printf("%s\n", good ? "✔️" : "❌"); + + return good ? 0 : 1; +} + int main(int argc, char *argv[]) { + struct sock_filter bitmap_filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog bitmap_prog = { + .len = (unsigned short)ARRAY_SIZE(bitmap_filter), + .filter = bitmap_filter, + }; struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, args[0])), BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog prog = { .len = (unsigned short)ARRAY_SIZE(filter), .filter = filter, }; - long ret; - unsigned long long samples; - unsigned long long native, filter1, filter2; + + long ret, bits; + unsigned long long samples, calc; + unsigned long long native, filter1, filter2, bitmap1, bitmap2; + unsigned long long entry, per_filter1, per_filter2; printf("Current BPF sysctl settings:\n"); system("sysctl net.core.bpf_jit_enable"); @@ -101,35 +161,82 @@ int main(int argc, char *argv[]) ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); assert(ret == 0); - /* One filter */ - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + /* One filter resulting in a bitmap */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); assert(ret == 0); - filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1); + bitmap1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1); + + /* Second filter resulting in a bitmap */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); + assert(ret == 0); - if (filter1 == native) - printf("No overhead measured!? Try running again with more samples.\n"); + bitmap2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2); - /* Two filters */ + /* Third filter, can no longer be converted to bitmap */ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); assert(ret == 0); - filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2); - - /* Calculations */ - printf("Estimated total seccomp overhead for 1 filter: %llu ns\n", - filter1 - native); + filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1); - printf("Estimated total seccomp overhead for 2 filters: %llu ns\n", - filter2 - native); + /* Fourth filter, can not be converted to bitmap because of filter 3 */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); + assert(ret == 0); - printf("Estimated seccomp per-filter overhead: %llu ns\n", - filter2 - filter1); + filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2); + + /* Estimations */ +#define ESTIMATE(fmt, var, what) do { \ + var = (what); \ + printf("Estimated " fmt ": %llu ns\n", var); \ + if (var > INT_MAX) \ + goto more_samples; \ + } while (0) + + ESTIMATE("total seccomp overhead for 1 bitmapped filter", calc, + bitmap1 - native); + ESTIMATE("total seccomp overhead for 2 bitmapped filters", calc, + bitmap2 - native); + ESTIMATE("total seccomp overhead for 3 full filters", calc, + filter1 - native); + ESTIMATE("total seccomp overhead for 4 full filters", calc, + filter2 - native); + ESTIMATE("seccomp entry overhead", entry, + bitmap1 - native - (bitmap2 - bitmap1)); + ESTIMATE("seccomp per-filter overhead (last 2 diff)", per_filter1, + filter2 - filter1); + ESTIMATE("seccomp per-filter overhead (filters / 4)", per_filter2, + (filter2 - native - entry) / 4); + + printf("Expectations:\n"); + ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1); + bits = compare("native", "≤", "1 filter", native, le, filter1); + if (bits) + goto more_samples; + + ret |= compare("per-filter (last 2 diff)", "≈", "per-filter (filters / 4)", + per_filter1, approx, per_filter2); + + bits = compare("1 bitmapped", "≈", "2 bitmapped", + bitmap1 - native, approx, bitmap2 - native); + if (bits) { + printf("Skipping constant action bitmap expectations: they appear unsupported.\n"); + goto out; + } - printf("Estimated seccomp entry overhead: %llu ns\n", - filter1 - native - (filter2 - filter1)); + ret |= compare("entry", "≈", "1 bitmapped", entry, approx, bitmap1 - native); + ret |= compare("entry", "≈", "2 bitmapped", entry, approx, bitmap2 - native); + ret |= compare("native + entry + (per filter * 4)", "≈", "4 filters total", + entry + (per_filter1 * 4) + native, approx, filter2); + if (ret == 0) + goto out; +more_samples: + printf("Saw unexpected benchmark result. Try running again with more samples?\n"); +out: return 0; } diff --git a/tools/testing/selftests/seccomp/settings b/tools/testing/selftests/seccomp/settings index ba4d85f74cd6..6091b45d226b 100644 --- a/tools/testing/selftests/seccomp/settings +++ b/tools/testing/selftests/seccomp/settings @@ -1 +1 @@ -timeout=90 +timeout=120 diff --git a/tools/testing/selftests/sgx/.gitignore b/tools/testing/selftests/sgx/.gitignore new file mode 100644 index 000000000000..fbaf0bda9a92 --- /dev/null +++ b/tools/testing/selftests/sgx/.gitignore @@ -0,0 +1,2 @@ +test_sgx +test_encl.elf diff --git a/tools/testing/selftests/sgx/Makefile b/tools/testing/selftests/sgx/Makefile new file mode 100644 index 000000000000..7f12d55b97f8 --- /dev/null +++ b/tools/testing/selftests/sgx/Makefile @@ -0,0 +1,57 @@ +top_srcdir = ../../../.. + +include ../lib.mk + +.PHONY: all clean + +CAN_BUILD_X86_64 := $(shell ../x86/check_cc.sh $(CC) \ + ../x86/trivial_64bit_program.c) + +ifndef OBJCOPY +OBJCOPY := $(CROSS_COMPILE)objcopy +endif + +INCLUDES := -I$(top_srcdir)/tools/include +HOST_CFLAGS := -Wall -Werror -g $(INCLUDES) -fPIC -z noexecstack +ENCL_CFLAGS := -Wall -Werror -static -nostdlib -nostartfiles -fPIC \ + -fno-stack-protector -mrdrnd $(INCLUDES) + +TEST_CUSTOM_PROGS := $(OUTPUT)/test_sgx + +ifeq ($(CAN_BUILD_X86_64), 1) +all: $(TEST_CUSTOM_PROGS) $(OUTPUT)/test_encl.elf +endif + +$(OUTPUT)/test_sgx: $(OUTPUT)/main.o \ + $(OUTPUT)/load.o \ + $(OUTPUT)/sigstruct.o \ + $(OUTPUT)/call.o \ + $(OUTPUT)/sign_key.o + $(CC) $(HOST_CFLAGS) -o $@ $^ -lcrypto + +$(OUTPUT)/main.o: main.c + $(CC) $(HOST_CFLAGS) -c $< -o $@ + +$(OUTPUT)/load.o: load.c + $(CC) $(HOST_CFLAGS) -c $< -o $@ + +$(OUTPUT)/sigstruct.o: sigstruct.c + $(CC) $(HOST_CFLAGS) -c $< -o $@ + +$(OUTPUT)/call.o: call.S + $(CC) $(HOST_CFLAGS) -c $< -o $@ + +$(OUTPUT)/sign_key.o: sign_key.S + $(CC) $(HOST_CFLAGS) -c $< -o $@ + +$(OUTPUT)/test_encl.elf: test_encl.lds test_encl.c test_encl_bootstrap.S + $(CC) $(ENCL_CFLAGS) -T $^ -o $@ + +EXTRA_CLEAN := \ + $(OUTPUT)/test_encl.elf \ + $(OUTPUT)/load.o \ + $(OUTPUT)/call.o \ + $(OUTPUT)/main.o \ + $(OUTPUT)/sigstruct.o \ + $(OUTPUT)/test_sgx \ + $(OUTPUT)/test_sgx.o \ diff --git a/tools/testing/selftests/sgx/call.S b/tools/testing/selftests/sgx/call.S new file mode 100644 index 000000000000..4ecadc7490f4 --- /dev/null +++ b/tools/testing/selftests/sgx/call.S @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** +* Copyright(c) 2016-20 Intel Corporation. +*/ + + .text + + .global sgx_call_vdso +sgx_call_vdso: + .cfi_startproc + push %r15 + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %r15, 0 + push %r14 + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %r14, 0 + push %r13 + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %r13, 0 + push %r12 + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %r12, 0 + push %rbx + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %rbx, 0 + push $0 + .cfi_adjust_cfa_offset 8 + push 0x38(%rsp) + .cfi_adjust_cfa_offset 8 + call *eenter(%rip) + add $0x10, %rsp + .cfi_adjust_cfa_offset -0x10 + pop %rbx + .cfi_adjust_cfa_offset -8 + pop %r12 + .cfi_adjust_cfa_offset -8 + pop %r13 + .cfi_adjust_cfa_offset -8 + pop %r14 + .cfi_adjust_cfa_offset -8 + pop %r15 + .cfi_adjust_cfa_offset -8 + ret + .cfi_endproc diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h new file mode 100644 index 000000000000..592c1ccf4576 --- /dev/null +++ b/tools/testing/selftests/sgx/defines.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2016-20 Intel Corporation. + */ + +#ifndef DEFINES_H +#define DEFINES_H + +#include <stdint.h> + +#define PAGE_SIZE 4096 +#define PAGE_MASK (~(PAGE_SIZE - 1)) + +#define __aligned(x) __attribute__((__aligned__(x))) +#define __packed __attribute__((packed)) + +#include "../../../../arch/x86/kernel/cpu/sgx/arch.h" +#include "../../../../arch/x86/include/asm/enclu.h" +#include "../../../../arch/x86/include/uapi/asm/sgx.h" + +#endif /* DEFINES_H */ diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c new file mode 100644 index 000000000000..9d43b75aaa55 --- /dev/null +++ b/tools/testing/selftests/sgx/load.c @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <assert.h> +#include <elf.h> +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include "defines.h" +#include "main.h" + +void encl_delete(struct encl *encl) +{ + if (encl->encl_base) + munmap((void *)encl->encl_base, encl->encl_size); + + if (encl->bin) + munmap(encl->bin, encl->bin_size); + + if (encl->fd) + close(encl->fd); + + if (encl->segment_tbl) + free(encl->segment_tbl); + + memset(encl, 0, sizeof(*encl)); +} + +static bool encl_map_bin(const char *path, struct encl *encl) +{ + struct stat sb; + void *bin; + int ret; + int fd; + + fd = open(path, O_RDONLY); + if (fd == -1) { + perror("open()"); + return false; + } + + ret = stat(path, &sb); + if (ret) { + perror("stat()"); + goto err; + } + + bin = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (bin == MAP_FAILED) { + perror("mmap()"); + goto err; + } + + encl->bin = bin; + encl->bin_size = sb.st_size; + + close(fd); + return true; + +err: + close(fd); + return false; +} + +static bool encl_ioc_create(struct encl *encl) +{ + struct sgx_secs *secs = &encl->secs; + struct sgx_enclave_create ioc; + int rc; + + assert(encl->encl_base != 0); + + memset(secs, 0, sizeof(*secs)); + secs->ssa_frame_size = 1; + secs->attributes = SGX_ATTR_MODE64BIT; + secs->xfrm = 3; + secs->base = encl->encl_base; + secs->size = encl->encl_size; + + ioc.src = (unsigned long)secs; + rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_CREATE, &ioc); + if (rc) { + fprintf(stderr, "SGX_IOC_ENCLAVE_CREATE failed: errno=%d\n", + errno); + munmap((void *)secs->base, encl->encl_size); + return false; + } + + return true; +} + +static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg) +{ + struct sgx_enclave_add_pages ioc; + struct sgx_secinfo secinfo; + int rc; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = seg->flags; + + ioc.src = (uint64_t)encl->src + seg->offset; + ioc.offset = seg->offset; + ioc.length = seg->size; + ioc.secinfo = (unsigned long)&secinfo; + ioc.flags = SGX_PAGE_MEASURE; + + rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_ADD_PAGES, &ioc); + if (rc < 0) { + fprintf(stderr, "SGX_IOC_ENCLAVE_ADD_PAGES failed: errno=%d.\n", + errno); + return false; + } + + return true; +} + +bool encl_load(const char *path, struct encl *encl) +{ + Elf64_Phdr *phdr_tbl; + off_t src_offset; + Elf64_Ehdr *ehdr; + int i, j; + int ret; + + memset(encl, 0, sizeof(*encl)); + + ret = open("/dev/sgx_enclave", O_RDWR); + if (ret < 0) { + fprintf(stderr, "Unable to open /dev/sgx_enclave\n"); + goto err; + } + + encl->fd = ret; + + if (!encl_map_bin(path, encl)) + goto err; + + ehdr = encl->bin; + phdr_tbl = encl->bin + ehdr->e_phoff; + + for (i = 0; i < ehdr->e_phnum; i++) { + Elf64_Phdr *phdr = &phdr_tbl[i]; + + if (phdr->p_type == PT_LOAD) + encl->nr_segments++; + } + + encl->segment_tbl = calloc(encl->nr_segments, + sizeof(struct encl_segment)); + if (!encl->segment_tbl) + goto err; + + for (i = 0, j = 0; i < ehdr->e_phnum; i++) { + Elf64_Phdr *phdr = &phdr_tbl[i]; + unsigned int flags = phdr->p_flags; + struct encl_segment *seg; + + if (phdr->p_type != PT_LOAD) + continue; + + seg = &encl->segment_tbl[j]; + + if (!!(flags & ~(PF_R | PF_W | PF_X))) { + fprintf(stderr, + "%d has invalid segment flags 0x%02x.\n", i, + phdr->p_flags); + goto err; + } + + if (j == 0 && flags != (PF_R | PF_W)) { + fprintf(stderr, + "TCS has invalid segment flags 0x%02x.\n", + phdr->p_flags); + goto err; + } + + if (j == 0) { + src_offset = phdr->p_offset & PAGE_MASK; + + seg->prot = PROT_READ | PROT_WRITE; + seg->flags = SGX_PAGE_TYPE_TCS << 8; + } else { + seg->prot = (phdr->p_flags & PF_R) ? PROT_READ : 0; + seg->prot |= (phdr->p_flags & PF_W) ? PROT_WRITE : 0; + seg->prot |= (phdr->p_flags & PF_X) ? PROT_EXEC : 0; + seg->flags = (SGX_PAGE_TYPE_REG << 8) | seg->prot; + } + + seg->offset = (phdr->p_offset & PAGE_MASK) - src_offset; + seg->size = (phdr->p_filesz + PAGE_SIZE - 1) & PAGE_MASK; + + printf("0x%016lx 0x%016lx 0x%02x\n", seg->offset, seg->size, + seg->prot); + + j++; + } + + assert(j == encl->nr_segments); + + encl->src = encl->bin + src_offset; + encl->src_size = encl->segment_tbl[j - 1].offset + + encl->segment_tbl[j - 1].size; + + for (encl->encl_size = 4096; encl->encl_size < encl->src_size; ) + encl->encl_size <<= 1; + + return true; + +err: + encl_delete(encl); + return false; +} + +static bool encl_map_area(struct encl *encl) +{ + size_t encl_size = encl->encl_size; + void *area; + + area = mmap(NULL, encl_size * 2, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (area == MAP_FAILED) { + perror("mmap"); + return false; + } + + encl->encl_base = ((uint64_t)area + encl_size - 1) & ~(encl_size - 1); + + munmap(area, encl->encl_base - (uint64_t)area); + munmap((void *)(encl->encl_base + encl_size), + (uint64_t)area + encl_size - encl->encl_base); + + return true; +} + +bool encl_build(struct encl *encl) +{ + struct sgx_enclave_init ioc; + int ret; + int i; + + if (!encl_map_area(encl)) + return false; + + if (!encl_ioc_create(encl)) + return false; + + /* + * Pages must be added before mapping VMAs because their permissions + * cap the VMA permissions. + */ + for (i = 0; i < encl->nr_segments; i++) { + struct encl_segment *seg = &encl->segment_tbl[i]; + + if (!encl_ioc_add_pages(encl, seg)) + return false; + } + + ioc.sigstruct = (uint64_t)&encl->sigstruct; + ret = ioctl(encl->fd, SGX_IOC_ENCLAVE_INIT, &ioc); + if (ret) { + fprintf(stderr, "SGX_IOC_ENCLAVE_INIT failed: errno=%d\n", + errno); + return false; + } + + return true; +} diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c new file mode 100644 index 000000000000..724cec700926 --- /dev/null +++ b/tools/testing/selftests/sgx/main.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <elf.h> +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include "defines.h" +#include "main.h" +#include "../kselftest.h" + +static const uint64_t MAGIC = 0x1122334455667788ULL; +vdso_sgx_enter_enclave_t eenter; + +struct vdso_symtab { + Elf64_Sym *elf_symtab; + const char *elf_symstrtab; + Elf64_Word *elf_hashtab; +}; + +static void *vdso_get_base_addr(char *envp[]) +{ + Elf64_auxv_t *auxv; + int i; + + for (i = 0; envp[i]; i++) + ; + + auxv = (Elf64_auxv_t *)&envp[i + 1]; + + for (i = 0; auxv[i].a_type != AT_NULL; i++) { + if (auxv[i].a_type == AT_SYSINFO_EHDR) + return (void *)auxv[i].a_un.a_val; + } + + return NULL; +} + +static Elf64_Dyn *vdso_get_dyntab(void *addr) +{ + Elf64_Ehdr *ehdr = addr; + Elf64_Phdr *phdrtab = addr + ehdr->e_phoff; + int i; + + for (i = 0; i < ehdr->e_phnum; i++) + if (phdrtab[i].p_type == PT_DYNAMIC) + return addr + phdrtab[i].p_offset; + + return NULL; +} + +static void *vdso_get_dyn(void *addr, Elf64_Dyn *dyntab, Elf64_Sxword tag) +{ + int i; + + for (i = 0; dyntab[i].d_tag != DT_NULL; i++) + if (dyntab[i].d_tag == tag) + return addr + dyntab[i].d_un.d_ptr; + + return NULL; +} + +static bool vdso_get_symtab(void *addr, struct vdso_symtab *symtab) +{ + Elf64_Dyn *dyntab = vdso_get_dyntab(addr); + + symtab->elf_symtab = vdso_get_dyn(addr, dyntab, DT_SYMTAB); + if (!symtab->elf_symtab) + return false; + + symtab->elf_symstrtab = vdso_get_dyn(addr, dyntab, DT_STRTAB); + if (!symtab->elf_symstrtab) + return false; + + symtab->elf_hashtab = vdso_get_dyn(addr, dyntab, DT_HASH); + if (!symtab->elf_hashtab) + return false; + + return true; +} + +static unsigned long elf_sym_hash(const char *name) +{ + unsigned long h = 0, high; + + while (*name) { + h = (h << 4) + *name++; + high = h & 0xf0000000; + + if (high) + h ^= high >> 24; + + h &= ~high; + } + + return h; +} + +static Elf64_Sym *vdso_symtab_get(struct vdso_symtab *symtab, const char *name) +{ + Elf64_Word bucketnum = symtab->elf_hashtab[0]; + Elf64_Word *buckettab = &symtab->elf_hashtab[2]; + Elf64_Word *chaintab = &symtab->elf_hashtab[2 + bucketnum]; + Elf64_Sym *sym; + Elf64_Word i; + + for (i = buckettab[elf_sym_hash(name) % bucketnum]; i != STN_UNDEF; + i = chaintab[i]) { + sym = &symtab->elf_symtab[i]; + if (!strcmp(name, &symtab->elf_symstrtab[sym->st_name])) + return sym; + } + + return NULL; +} + +bool report_results(struct sgx_enclave_run *run, int ret, uint64_t result, + const char *test) +{ + bool valid = true; + + if (ret) { + printf("FAIL: %s() returned: %d\n", test, ret); + valid = false; + } + + if (run->function != EEXIT) { + printf("FAIL: %s() function, expected: %u, got: %u\n", test, EEXIT, + run->function); + valid = false; + } + + if (result != MAGIC) { + printf("FAIL: %s(), expected: 0x%lx, got: 0x%lx\n", test, MAGIC, + result); + valid = false; + } + + if (run->user_data) { + printf("FAIL: %s() user data, expected: 0x0, got: 0x%llx\n", + test, run->user_data); + valid = false; + } + + return valid; +} + +static int user_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r9, + struct sgx_enclave_run *run) +{ + run->user_data = 0; + return 0; +} + +int main(int argc, char *argv[], char *envp[]) +{ + struct sgx_enclave_run run; + struct vdso_symtab symtab; + Elf64_Sym *eenter_sym; + uint64_t result = 0; + struct encl encl; + unsigned int i; + void *addr; + int ret; + + memset(&run, 0, sizeof(run)); + + if (!encl_load("test_encl.elf", &encl)) { + encl_delete(&encl); + ksft_exit_skip("cannot load enclaves\n"); + } + + if (!encl_measure(&encl)) + goto err; + + if (!encl_build(&encl)) + goto err; + + /* + * An enclave consumer only must do this. + */ + for (i = 0; i < encl.nr_segments; i++) { + struct encl_segment *seg = &encl.segment_tbl[i]; + + addr = mmap((void *)encl.encl_base + seg->offset, seg->size, + seg->prot, MAP_SHARED | MAP_FIXED, encl.fd, 0); + if (addr == MAP_FAILED) { + fprintf(stderr, "mmap() failed, errno=%d.\n", errno); + exit(KSFT_FAIL); + } + } + + memset(&run, 0, sizeof(run)); + run.tcs = encl.encl_base; + + addr = vdso_get_base_addr(envp); + if (!addr) + goto err; + + if (!vdso_get_symtab(addr, &symtab)) + goto err; + + eenter_sym = vdso_symtab_get(&symtab, "__vdso_sgx_enter_enclave"); + if (!eenter_sym) + goto err; + + eenter = addr + eenter_sym->st_value; + + ret = sgx_call_vdso((void *)&MAGIC, &result, 0, EENTER, NULL, NULL, &run); + if (!report_results(&run, ret, result, "sgx_call_vdso")) + goto err; + + + /* Invoke the vDSO directly. */ + result = 0; + ret = eenter((unsigned long)&MAGIC, (unsigned long)&result, 0, EENTER, + 0, 0, &run); + if (!report_results(&run, ret, result, "eenter")) + goto err; + + /* And with an exit handler. */ + run.user_handler = (__u64)user_handler; + run.user_data = 0xdeadbeef; + ret = eenter((unsigned long)&MAGIC, (unsigned long)&result, 0, EENTER, + 0, 0, &run); + if (!report_results(&run, ret, result, "user_handler")) + goto err; + + printf("SUCCESS\n"); + encl_delete(&encl); + exit(KSFT_PASS); + +err: + encl_delete(&encl); + exit(KSFT_FAIL); +} diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h new file mode 100644 index 000000000000..67211a708f04 --- /dev/null +++ b/tools/testing/selftests/sgx/main.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2016-20 Intel Corporation. + */ + +#ifndef MAIN_H +#define MAIN_H + +struct encl_segment { + off_t offset; + size_t size; + unsigned int prot; + unsigned int flags; +}; + +struct encl { + int fd; + void *bin; + off_t bin_size; + void *src; + size_t src_size; + size_t encl_size; + off_t encl_base; + unsigned int nr_segments; + struct encl_segment *segment_tbl; + struct sgx_secs secs; + struct sgx_sigstruct sigstruct; +}; + +extern unsigned char sign_key[]; +extern unsigned char sign_key_end[]; + +void encl_delete(struct encl *ctx); +bool encl_load(const char *path, struct encl *encl); +bool encl_measure(struct encl *encl); +bool encl_build(struct encl *encl); + +int sgx_call_vdso(void *rdi, void *rsi, long rdx, u32 function, void *r8, void *r9, + struct sgx_enclave_run *run); + +#endif /* MAIN_H */ diff --git a/tools/testing/selftests/sgx/sign_key.S b/tools/testing/selftests/sgx/sign_key.S new file mode 100644 index 000000000000..e4fbe948444a --- /dev/null +++ b/tools/testing/selftests/sgx/sign_key.S @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** +* Copyright(c) 2016-20 Intel Corporation. +*/ + + .section ".rodata", "a" + +sign_key: + .globl sign_key + .incbin "sign_key.pem" +sign_key_end: + .globl sign_key_end diff --git a/tools/testing/selftests/sgx/sign_key.pem b/tools/testing/selftests/sgx/sign_key.pem new file mode 100644 index 000000000000..d76f21f19187 --- /dev/null +++ b/tools/testing/selftests/sgx/sign_key.pem @@ -0,0 +1,39 @@ +-----BEGIN RSA PRIVATE KEY----- +MIIG4wIBAAKCAYEApalGbq7Q+usM91CPtksu3D+b0Prc8gAFL6grM3mg85A5Bx8V +cfMXPgtrw8EYFwQxDAvzZWwl+9VfOX0ECrFRBkOHcOiG0SnADN8+FLj1UiNUQwbp +S6OzhNWuRcSbGraSOyUlVlV0yMQSvewyzGklOaXBe30AJqzIBc8QfdSxKuP8rs0Z +ga6k/Bl73osrYKByILJTUUeZqjLERsE6GebsdzbWgKn8qVqng4ZS4yMNg6LeRlH3 ++9CIPgg4jwpSLHcp7dq2qTIB9a0tGe9ayp+5FbucpB6U7ePold0EeRN6RlJGDF9k +L93v8P5ykz5G5gYZ2g0K1X2sHIWV4huxPgv5PXgdyQYbK+6olqj0d5rjYuwX57Ul +k6SroPS1U6UbdCjG5txM+BNGU0VpD0ZhrIRw0leQdnNcCO9sTJuInZrgYacSVJ7u +mtB+uCt+uzUesc+l+xPRYA+9e14lLkZp7AAmo9FvL816XDI09deehJ3i/LmHKCRN +tuqC5TprRjFwUr6dAgEDAoIBgG5w2Z8fNfycs0+LCnmHdJLVEotR6KFVWMpwHMz7 +wKJgJgS/Y6FMuilc8oKAuroCy11dTO5IGVKOP3uorVx2NgQtBPXwWeDGgAiU1A3Q +o4wXjYIEm4fCd63jyYPYZ2ckYXzDbjmOTdstYdPyzIhGGNEZK6eoqsRzMAPfYFPj +IMdCqHSIu6vJw1K7p+myHOsVoWshjODaZnF3LYSA0WaZ8vokjwBxUxuRxQJZjJds +s60XPtmL+qfgWtQFewoG4XL6GuD8FcXccynRRtzrLtFNPIl9BQfWfjBBhTC1/Te1 +0Z6XbZvpdUTD9OfLB7SbR2OUFNpKQgriO0iYVdbW3cr7uu38Zwp4W1TX73DPjoi6 +KNooP6SGWd4mRJW2+dUmSYS4QNG8eVVZswKcploEIXlAKRsOe4kzJJ1iETugIe85 +uX8nd1WYEp65xwoRUg8hqng0MeyveVbXqNKuJG6tzNDt9kgFYo+hmC/oouAW2Dtc +T9jdRAwKJXqA2Eg6OkgXCEv+kwKBwQDYaQiFMlFhsmLlqI+EzCUh7c941/cL7m6U +7j98+8ngl0HgCEcrc10iJVCKakQW3YbPzAx3XkKTaGjWazvvrFarXIGlOud64B8a +iWyQ7VdlnmZnNEdk+C83tI91OQeaTKqRLDGzKh29Ry/jL8Pcbazt+kDgxa0H7qJp +roADUanLQuNkYubpbhFBh3xpa2EExaVq6rF7nIVsD8W9TrbmPKA4LgH7z0iy544D +kVCNYsTjYDdUWP+WiSor8kCnnpjnN9sCgcEAw/eNezUD1UDf6OYFC9+5JZJFn4Tg +mZMyN93JKIb199ffwnjtHUSjcyiWeesXucpzwtGbTcwQnDisSW4oneYKLSEBlBaq +scqiUugyGZZOthFSCbdXYXMViK2vHrKlkse7GxVlROKcEhM/pRBrmjaGO8eWR+D4 +FO2wCXzVs3KgV6j779frw0vC54oHOxc9+Lu1rSHp4i+600koyvL/zF6U/5tZXIvN +YW2yoiQJnjCmVA1pwbwV6KAUTPDTMnBK+YjnAoHBAJBGBa4hi5Z27JkbCliIGMFJ +NPs6pLKe9GNJf6in2+sPgUAFhMeiPhbDiwbxgrnpBIqICE+ULGJFmzmc0p/IOceT +ARjR76dAFLxbnbXzj5kURETNhO36yiUjCk4mBRGIcbYddndxaSjaH+zKgpLzyJ6m +1esuc1qfFvEfAAI2cTIsl5hB70ZJYNZaUvDyQK3ZGPHxy6e9rkgKg9OJz0QoatAe +q/002yHvtAJg4F5B2JeVejg7VQ8GHB1MKxppu0TP5wKBwQCCpQj8zgKOKz/wmViy +lSYZDC5qWJW7t3bP6TDFr06lOpUsUJ4TgxeiGw778g/RMaKB4RIz3WBoJcgw9BsT +7rFza1ZiucchMcGMmswRDt8kC4wGejpA92Owc8oUdxkMhSdnY5jYlxK2t3/DYEe8 +JFl9L7mFQKVjSSAGUzkiTGrlG1Kf5UfXh9dFBq98uilQfSPIwUaWynyM23CHTKqI +Pw3/vOY9sojrnncWwrEUIG7is5vWfWPwargzSzd29YdRBe8CgcEAuRVewK/YeNOX +B7ZG6gKKsfsvrGtY7FPETzLZAHjoVXYNea4LVZ2kn4hBXXlvw/4HD+YqcTt4wmif +5JQlDvjNobUiKJZpzy7hklVhF7wZFl4pCF7Yh43q9iQ7gKTaeUG7MiaK+G8Zz8aY +HW9rsiihbdZkccMvnPfO9334XMxl3HtBRzLstjUlbLB7Sdh+7tZ3JQidCOFNs5pE +XyWwnASPu4tKfDahH1UUTp1uJcq/6716CSWg080avYxFcn75qqsb +-----END RSA PRIVATE KEY----- diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c new file mode 100644 index 000000000000..dee7a3d6c5a5 --- /dev/null +++ b/tools/testing/selftests/sgx/sigstruct.c @@ -0,0 +1,381 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#define _GNU_SOURCE +#include <assert.h> +#include <getopt.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <openssl/err.h> +#include <openssl/pem.h> +#include "defines.h" +#include "main.h" + +struct q1q2_ctx { + BN_CTX *bn_ctx; + BIGNUM *m; + BIGNUM *s; + BIGNUM *q1; + BIGNUM *qr; + BIGNUM *q2; +}; + +static void free_q1q2_ctx(struct q1q2_ctx *ctx) +{ + BN_CTX_free(ctx->bn_ctx); + BN_free(ctx->m); + BN_free(ctx->s); + BN_free(ctx->q1); + BN_free(ctx->qr); + BN_free(ctx->q2); +} + +static bool alloc_q1q2_ctx(const uint8_t *s, const uint8_t *m, + struct q1q2_ctx *ctx) +{ + ctx->bn_ctx = BN_CTX_new(); + ctx->s = BN_bin2bn(s, SGX_MODULUS_SIZE, NULL); + ctx->m = BN_bin2bn(m, SGX_MODULUS_SIZE, NULL); + ctx->q1 = BN_new(); + ctx->qr = BN_new(); + ctx->q2 = BN_new(); + + if (!ctx->bn_ctx || !ctx->s || !ctx->m || !ctx->q1 || !ctx->qr || + !ctx->q2) { + free_q1q2_ctx(ctx); + return false; + } + + return true; +} + +static bool calc_q1q2(const uint8_t *s, const uint8_t *m, uint8_t *q1, + uint8_t *q2) +{ + struct q1q2_ctx ctx; + + if (!alloc_q1q2_ctx(s, m, &ctx)) { + fprintf(stderr, "Not enough memory for Q1Q2 calculation\n"); + return false; + } + + if (!BN_mul(ctx.q1, ctx.s, ctx.s, ctx.bn_ctx)) + goto out; + + if (!BN_div(ctx.q1, ctx.qr, ctx.q1, ctx.m, ctx.bn_ctx)) + goto out; + + if (BN_num_bytes(ctx.q1) > SGX_MODULUS_SIZE) { + fprintf(stderr, "Too large Q1 %d bytes\n", + BN_num_bytes(ctx.q1)); + goto out; + } + + if (!BN_mul(ctx.q2, ctx.s, ctx.qr, ctx.bn_ctx)) + goto out; + + if (!BN_div(ctx.q2, NULL, ctx.q2, ctx.m, ctx.bn_ctx)) + goto out; + + if (BN_num_bytes(ctx.q2) > SGX_MODULUS_SIZE) { + fprintf(stderr, "Too large Q2 %d bytes\n", + BN_num_bytes(ctx.q2)); + goto out; + } + + BN_bn2bin(ctx.q1, q1); + BN_bn2bin(ctx.q2, q2); + + free_q1q2_ctx(&ctx); + return true; +out: + free_q1q2_ctx(&ctx); + return false; +} + +struct sgx_sigstruct_payload { + struct sgx_sigstruct_header header; + struct sgx_sigstruct_body body; +}; + +static bool check_crypto_errors(void) +{ + int err; + bool had_errors = false; + const char *filename; + int line; + char str[256]; + + for ( ; ; ) { + if (ERR_peek_error() == 0) + break; + + had_errors = true; + err = ERR_get_error_line(&filename, &line); + ERR_error_string_n(err, str, sizeof(str)); + fprintf(stderr, "crypto: %s: %s:%d\n", str, filename, line); + } + + return had_errors; +} + +static inline const BIGNUM *get_modulus(RSA *key) +{ + const BIGNUM *n; + + RSA_get0_key(key, &n, NULL, NULL); + return n; +} + +static RSA *gen_sign_key(void) +{ + unsigned long sign_key_length; + BIO *bio; + RSA *key; + + sign_key_length = (unsigned long)&sign_key_end - + (unsigned long)&sign_key; + + bio = BIO_new_mem_buf(&sign_key, sign_key_length); + if (!bio) + return NULL; + + key = PEM_read_bio_RSAPrivateKey(bio, NULL, NULL, NULL); + BIO_free(bio); + + return key; +} + +static void reverse_bytes(void *data, int length) +{ + int i = 0; + int j = length - 1; + uint8_t temp; + uint8_t *ptr = data; + + while (i < j) { + temp = ptr[i]; + ptr[i] = ptr[j]; + ptr[j] = temp; + i++; + j--; + } +} + +enum mrtags { + MRECREATE = 0x0045544145524345, + MREADD = 0x0000000044444145, + MREEXTEND = 0x00444E4554584545, +}; + +static bool mrenclave_update(EVP_MD_CTX *ctx, const void *data) +{ + if (!EVP_DigestUpdate(ctx, data, 64)) { + fprintf(stderr, "digest update failed\n"); + return false; + } + + return true; +} + +static bool mrenclave_commit(EVP_MD_CTX *ctx, uint8_t *mrenclave) +{ + unsigned int size; + + if (!EVP_DigestFinal_ex(ctx, (unsigned char *)mrenclave, &size)) { + fprintf(stderr, "digest commit failed\n"); + return false; + } + + if (size != 32) { + fprintf(stderr, "invalid digest size = %u\n", size); + return false; + } + + return true; +} + +struct mrecreate { + uint64_t tag; + uint32_t ssaframesize; + uint64_t size; + uint8_t reserved[44]; +} __attribute__((__packed__)); + + +static bool mrenclave_ecreate(EVP_MD_CTX *ctx, uint64_t blob_size) +{ + struct mrecreate mrecreate; + uint64_t encl_size; + + for (encl_size = 0x1000; encl_size < blob_size; ) + encl_size <<= 1; + + memset(&mrecreate, 0, sizeof(mrecreate)); + mrecreate.tag = MRECREATE; + mrecreate.ssaframesize = 1; + mrecreate.size = encl_size; + + if (!EVP_DigestInit_ex(ctx, EVP_sha256(), NULL)) + return false; + + return mrenclave_update(ctx, &mrecreate); +} + +struct mreadd { + uint64_t tag; + uint64_t offset; + uint64_t flags; /* SECINFO flags */ + uint8_t reserved[40]; +} __attribute__((__packed__)); + +static bool mrenclave_eadd(EVP_MD_CTX *ctx, uint64_t offset, uint64_t flags) +{ + struct mreadd mreadd; + + memset(&mreadd, 0, sizeof(mreadd)); + mreadd.tag = MREADD; + mreadd.offset = offset; + mreadd.flags = flags; + + return mrenclave_update(ctx, &mreadd); +} + +struct mreextend { + uint64_t tag; + uint64_t offset; + uint8_t reserved[48]; +} __attribute__((__packed__)); + +static bool mrenclave_eextend(EVP_MD_CTX *ctx, uint64_t offset, + const uint8_t *data) +{ + struct mreextend mreextend; + int i; + + for (i = 0; i < 0x1000; i += 0x100) { + memset(&mreextend, 0, sizeof(mreextend)); + mreextend.tag = MREEXTEND; + mreextend.offset = offset + i; + + if (!mrenclave_update(ctx, &mreextend)) + return false; + + if (!mrenclave_update(ctx, &data[i + 0x00])) + return false; + + if (!mrenclave_update(ctx, &data[i + 0x40])) + return false; + + if (!mrenclave_update(ctx, &data[i + 0x80])) + return false; + + if (!mrenclave_update(ctx, &data[i + 0xC0])) + return false; + } + + return true; +} + +static bool mrenclave_segment(EVP_MD_CTX *ctx, struct encl *encl, + struct encl_segment *seg) +{ + uint64_t end = seg->offset + seg->size; + uint64_t offset; + + for (offset = seg->offset; offset < end; offset += PAGE_SIZE) { + if (!mrenclave_eadd(ctx, offset, seg->flags)) + return false; + + if (!mrenclave_eextend(ctx, offset, encl->src + offset)) + return false; + } + + return true; +} + +bool encl_measure(struct encl *encl) +{ + uint64_t header1[2] = {0x000000E100000006, 0x0000000000010000}; + uint64_t header2[2] = {0x0000006000000101, 0x0000000100000060}; + struct sgx_sigstruct *sigstruct = &encl->sigstruct; + struct sgx_sigstruct_payload payload; + uint8_t digest[SHA256_DIGEST_LENGTH]; + unsigned int siglen; + RSA *key = NULL; + EVP_MD_CTX *ctx; + int i; + + memset(sigstruct, 0, sizeof(*sigstruct)); + + sigstruct->header.header1[0] = header1[0]; + sigstruct->header.header1[1] = header1[1]; + sigstruct->header.header2[0] = header2[0]; + sigstruct->header.header2[1] = header2[1]; + sigstruct->exponent = 3; + sigstruct->body.attributes = SGX_ATTR_MODE64BIT; + sigstruct->body.xfrm = 3; + + /* sanity check */ + if (check_crypto_errors()) + goto err; + + key = gen_sign_key(); + if (!key) { + ERR_print_errors_fp(stdout); + goto err; + } + + BN_bn2bin(get_modulus(key), sigstruct->modulus); + + ctx = EVP_MD_CTX_create(); + if (!ctx) + goto err; + + if (!mrenclave_ecreate(ctx, encl->src_size)) + goto err; + + for (i = 0; i < encl->nr_segments; i++) { + struct encl_segment *seg = &encl->segment_tbl[i]; + + if (!mrenclave_segment(ctx, encl, seg)) + goto err; + } + + if (!mrenclave_commit(ctx, sigstruct->body.mrenclave)) + goto err; + + memcpy(&payload.header, &sigstruct->header, sizeof(sigstruct->header)); + memcpy(&payload.body, &sigstruct->body, sizeof(sigstruct->body)); + + SHA256((unsigned char *)&payload, sizeof(payload), digest); + + if (!RSA_sign(NID_sha256, digest, SHA256_DIGEST_LENGTH, + sigstruct->signature, &siglen, key)) + goto err; + + if (!calc_q1q2(sigstruct->signature, sigstruct->modulus, sigstruct->q1, + sigstruct->q2)) + goto err; + + /* BE -> LE */ + reverse_bytes(sigstruct->signature, SGX_MODULUS_SIZE); + reverse_bytes(sigstruct->modulus, SGX_MODULUS_SIZE); + reverse_bytes(sigstruct->q1, SGX_MODULUS_SIZE); + reverse_bytes(sigstruct->q2, SGX_MODULUS_SIZE); + + EVP_MD_CTX_destroy(ctx); + RSA_free(key); + return true; + +err: + EVP_MD_CTX_destroy(ctx); + RSA_free(key); + return false; +} diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c new file mode 100644 index 000000000000..cf25b5dc1e03 --- /dev/null +++ b/tools/testing/selftests/sgx/test_encl.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <stddef.h> +#include "defines.h" + +static void *memcpy(void *dest, const void *src, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + ((char *)dest)[i] = ((char *)src)[i]; + + return dest; +} + +void encl_body(void *rdi, void *rsi) +{ + memcpy(rsi, rdi, 8); +} diff --git a/tools/testing/selftests/sgx/test_encl.lds b/tools/testing/selftests/sgx/test_encl.lds new file mode 100644 index 000000000000..0fbbda7e665e --- /dev/null +++ b/tools/testing/selftests/sgx/test_encl.lds @@ -0,0 +1,40 @@ +OUTPUT_FORMAT(elf64-x86-64) + +PHDRS +{ + tcs PT_LOAD; + text PT_LOAD; + data PT_LOAD; +} + +SECTIONS +{ + . = 0; + .tcs : { + *(.tcs*) + } : tcs + + . = ALIGN(4096); + .text : { + *(.text*) + *(.rodata*) + } : text + + . = ALIGN(4096); + .data : { + *(.data*) + } : data + + /DISCARD/ : { + *(.comment*) + *(.note*) + *(.debug*) + *(.eh_frame*) + } +} + +ASSERT(!DEFINED(.altinstructions), "ALTERNATIVES are not supported in enclaves") +ASSERT(!DEFINED(.altinstr_replacement), "ALTERNATIVES are not supported in enclaves") +ASSERT(!DEFINED(.discard.retpoline_safe), "RETPOLINE ALTERNATIVES are not supported in enclaves") +ASSERT(!DEFINED(.discard.nospec), "RETPOLINE ALTERNATIVES are not supported in enclaves") +ASSERT(!DEFINED(.got.plt), "Libcalls are not supported in enclaves") diff --git a/tools/testing/selftests/sgx/test_encl_bootstrap.S b/tools/testing/selftests/sgx/test_encl_bootstrap.S new file mode 100644 index 000000000000..5d5680d4ea39 --- /dev/null +++ b/tools/testing/selftests/sgx/test_encl_bootstrap.S @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2016-20 Intel Corporation. + */ + + .macro ENCLU + .byte 0x0f, 0x01, 0xd7 + .endm + + .section ".tcs", "aw" + .balign 4096 + + .fill 1, 8, 0 # STATE (set by CPU) + .fill 1, 8, 0 # FLAGS + .quad encl_ssa # OSSA + .fill 1, 4, 0 # CSSA (set by CPU) + .fill 1, 4, 1 # NSSA + .quad encl_entry # OENTRY + .fill 1, 8, 0 # AEP (set by EENTER and ERESUME) + .fill 1, 8, 0 # OFSBASE + .fill 1, 8, 0 # OGSBASE + .fill 1, 4, 0xFFFFFFFF # FSLIMIT + .fill 1, 4, 0xFFFFFFFF # GSLIMIT + .fill 4024, 1, 0 # Reserved + + # Identical to the previous TCS. + .fill 1, 8, 0 # STATE (set by CPU) + .fill 1, 8, 0 # FLAGS + .quad encl_ssa # OSSA + .fill 1, 4, 0 # CSSA (set by CPU) + .fill 1, 4, 1 # NSSA + .quad encl_entry # OENTRY + .fill 1, 8, 0 # AEP (set by EENTER and ERESUME) + .fill 1, 8, 0 # OFSBASE + .fill 1, 8, 0 # OGSBASE + .fill 1, 4, 0xFFFFFFFF # FSLIMIT + .fill 1, 4, 0xFFFFFFFF # GSLIMIT + .fill 4024, 1, 0 # Reserved + + .text + +encl_entry: + # RBX contains the base address for TCS, which is also the first address + # inside the enclave. By adding the value of le_stack_end to it, we get + # the absolute address for the stack. + lea (encl_stack)(%rbx), %rax + xchg %rsp, %rax + push %rax + + push %rcx # push the address after EENTER + push %rbx # push the enclave base address + + call encl_body + + pop %rbx # pop the enclave base address + + /* Clear volatile GPRs, except RAX (EEXIT function). */ + xor %rcx, %rcx + xor %rdx, %rdx + xor %rdi, %rdi + xor %rsi, %rsi + xor %r8, %r8 + xor %r9, %r9 + xor %r10, %r10 + xor %r11, %r11 + + # Reset status flags. + add %rdx, %rdx # OF = SF = AF = CF = 0; ZF = PF = 1 + + # Prepare EEXIT target by popping the address of the instruction after + # EENTER to RBX. + pop %rbx + + # Restore the caller stack. + pop %rax + mov %rax, %rsp + + # EEXIT + mov $4, %rax + enclu + + .section ".data", "aw" + +encl_ssa: + .space 4096 + + .balign 4096 + .space 8192 +encl_stack: diff --git a/tools/testing/selftests/android/ion/.gitignore b/tools/testing/selftests/syscall_user_dispatch/.gitignore index 78eae9972bb1..f539615ad5da 100644 --- a/tools/testing/selftests/android/ion/.gitignore +++ b/tools/testing/selftests/syscall_user_dispatch/.gitignore @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -ionapp_export -ionapp_import -ionmap_test +sud_test +sud_benchmark diff --git a/tools/testing/selftests/syscall_user_dispatch/Makefile b/tools/testing/selftests/syscall_user_dispatch/Makefile new file mode 100644 index 000000000000..03c120270953 --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +top_srcdir = ../../../.. +INSTALL_HDR_PATH = $(top_srcdir)/usr +LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ + +CFLAGS += -Wall -I$(LINUX_HDR_PATH) + +TEST_GEN_PROGS := sud_test sud_benchmark +include ../lib.mk diff --git a/tools/testing/selftests/syscall_user_dispatch/config b/tools/testing/selftests/syscall_user_dispatch/config new file mode 100644 index 000000000000..039e303e59d7 --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/config @@ -0,0 +1 @@ +CONFIG_GENERIC_ENTRY=y diff --git a/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c b/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c new file mode 100644 index 000000000000..6689f1183dbf --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 Collabora Ltd. + * + * Benchmark and test syscall user dispatch + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <signal.h> +#include <errno.h> +#include <time.h> +#include <sys/time.h> +#include <unistd.h> +#include <sys/sysinfo.h> +#include <sys/prctl.h> +#include <sys/syscall.h> + +#ifndef PR_SET_SYSCALL_USER_DISPATCH +# define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 +#endif + +#ifdef __NR_syscalls +# define MAGIC_SYSCALL_1 (__NR_syscalls + 1) /* Bad Linux syscall number */ +#else +# define MAGIC_SYSCALL_1 (0xff00) /* Bad Linux syscall number */ +#endif + +/* + * To test returning from a sigsys with selector blocked, the test + * requires some per-architecture support (i.e. knowledge about the + * signal trampoline address). On i386, we know it is on the vdso, and + * a small trampoline is open-coded for x86_64. Other architectures + * that have a trampoline in the vdso will support TEST_BLOCKED_RETURN + * out of the box, but don't enable them until they support syscall user + * dispatch. + */ +#if defined(__x86_64__) || defined(__i386__) +#define TEST_BLOCKED_RETURN +#endif + +#ifdef __x86_64__ +void* (syscall_dispatcher_start)(void); +void* (syscall_dispatcher_end)(void); +#else +unsigned long syscall_dispatcher_start = 0; +unsigned long syscall_dispatcher_end = 0; +#endif + +unsigned long trapped_call_count = 0; +unsigned long native_call_count = 0; + +char selector; +#define SYSCALL_BLOCK (selector = PR_SYS_DISPATCH_ON) +#define SYSCALL_UNBLOCK (selector = PR_SYS_DISPATCH_OFF) + +#define CALIBRATION_STEP 100000 +#define CALIBRATE_TO_SECS 5 +int factor; + +static double one_sysinfo_step(void) +{ + struct timespec t1, t2; + int i; + struct sysinfo info; + + clock_gettime(CLOCK_MONOTONIC, &t1); + for (i = 0; i < CALIBRATION_STEP; i++) + sysinfo(&info); + clock_gettime(CLOCK_MONOTONIC, &t2); + return (t2.tv_sec - t1.tv_sec) + 1.0e-9 * (t2.tv_nsec - t1.tv_nsec); +} + +static void calibrate_set(void) +{ + double elapsed = 0; + + printf("Calibrating test set to last ~%d seconds...\n", CALIBRATE_TO_SECS); + + while (elapsed < 1) { + elapsed += one_sysinfo_step(); + factor += CALIBRATE_TO_SECS; + } + + printf("test iterations = %d\n", CALIBRATION_STEP * factor); +} + +static double perf_syscall(void) +{ + unsigned int i; + double partial = 0; + + for (i = 0; i < factor; ++i) + partial += one_sysinfo_step()/(CALIBRATION_STEP*factor); + return partial; +} + +static void handle_sigsys(int sig, siginfo_t *info, void *ucontext) +{ + char buf[1024]; + int len; + + SYSCALL_UNBLOCK; + + /* printf and friends are not signal-safe. */ + len = snprintf(buf, 1024, "Caught sys_%x\n", info->si_syscall); + write(1, buf, len); + + if (info->si_syscall == MAGIC_SYSCALL_1) + trapped_call_count++; + else + native_call_count++; + +#ifdef TEST_BLOCKED_RETURN + SYSCALL_BLOCK; +#endif + +#ifdef __x86_64__ + __asm__ volatile("movq $0xf, %rax"); + __asm__ volatile("leaveq"); + __asm__ volatile("add $0x8, %rsp"); + __asm__ volatile("syscall_dispatcher_start:"); + __asm__ volatile("syscall"); + __asm__ volatile("nop"); /* Landing pad within dispatcher area */ + __asm__ volatile("syscall_dispatcher_end:"); +#endif + +} + +int main(void) +{ + struct sigaction act; + double time1, time2; + int ret; + sigset_t mask; + + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + + act.sa_sigaction = handle_sigsys; + act.sa_flags = SA_SIGINFO; + act.sa_mask = mask; + + calibrate_set(); + + time1 = perf_syscall(); + printf("Avg syscall time %.0lfns.\n", time1 * 1.0e9); + + ret = sigaction(SIGSYS, &act, NULL); + if (ret) { + perror("Error sigaction:"); + exit(-1); + } + + fprintf(stderr, "Enabling syscall trapping.\n"); + + if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, + syscall_dispatcher_start, + (syscall_dispatcher_end - syscall_dispatcher_start + 1), + &selector)) { + perror("prctl failed\n"); + exit(-1); + } + + SYSCALL_BLOCK; + syscall(MAGIC_SYSCALL_1); + +#ifdef TEST_BLOCKED_RETURN + if (selector == PR_SYS_DISPATCH_OFF) { + fprintf(stderr, "Failed to return with selector blocked.\n"); + exit(-1); + } +#endif + + SYSCALL_UNBLOCK; + + if (!trapped_call_count) { + fprintf(stderr, "syscall trapping does not work.\n"); + exit(-1); + } + + time2 = perf_syscall(); + + if (native_call_count) { + perror("syscall trapping intercepted more syscalls than expected\n"); + exit(-1); + } + + printf("trapped_call_count %lu, native_call_count %lu.\n", + trapped_call_count, native_call_count); + printf("Avg syscall time %.0lfns.\n", time2 * 1.0e9); + printf("Interception overhead: %.1lf%% (+%.0lfns).\n", + 100.0 * (time2 / time1 - 1.0), 1.0e9 * (time2 - time1)); + return 0; + +} diff --git a/tools/testing/selftests/syscall_user_dispatch/sud_test.c b/tools/testing/selftests/syscall_user_dispatch/sud_test.c new file mode 100644 index 000000000000..6498b050ef89 --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/sud_test.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 Collabora Ltd. + * + * Test code for syscall user dispatch + */ + +#define _GNU_SOURCE +#include <sys/prctl.h> +#include <sys/sysinfo.h> +#include <sys/syscall.h> +#include <signal.h> + +#include <asm/unistd.h> +#include "../kselftest_harness.h" + +#ifndef PR_SET_SYSCALL_USER_DISPATCH +# define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 +#endif + +#ifndef SYS_USER_DISPATCH +# define SYS_USER_DISPATCH 2 +#endif + +#ifdef __NR_syscalls +# define MAGIC_SYSCALL_1 (__NR_syscalls + 1) /* Bad Linux syscall number */ +#else +# define MAGIC_SYSCALL_1 (0xff00) /* Bad Linux syscall number */ +#endif + +#define SYSCALL_DISPATCH_ON(x) ((x) = 1) +#define SYSCALL_DISPATCH_OFF(x) ((x) = 0) + +/* Test Summary: + * + * - dispatch_trigger_sigsys: Verify if PR_SET_SYSCALL_USER_DISPATCH is + * able to trigger SIGSYS on a syscall. + * + * - bad_selector: Test that a bad selector value triggers SIGSYS with + * si_errno EINVAL. + * + * - bad_prctl_param: Test that the API correctly rejects invalid + * parameters on prctl + * + * - dispatch_and_return: Test that a syscall is selectively dispatched + * to userspace depending on the value of selector. + * + * - disable_dispatch: Test that the PR_SYS_DISPATCH_OFF correctly + * disables the dispatcher + * + * - direct_dispatch_range: Test that a syscall within the allowed range + * can bypass the dispatcher. + */ + +TEST_SIGNAL(dispatch_trigger_sigsys, SIGSYS) +{ + char sel = 0; + struct sysinfo info; + int ret; + + ret = sysinfo(&info); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + SYSCALL_DISPATCH_ON(sel); + + sysinfo(&info); + + EXPECT_FALSE(true) { + TH_LOG("Unreachable!"); + } +} + +TEST(bad_prctl_param) +{ + char sel = 0; + int op; + + /* Invalid op */ + op = -1; + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0, 0, &sel); + ASSERT_EQ(EINVAL, errno); + + /* PR_SYS_DISPATCH_OFF */ + op = PR_SYS_DISPATCH_OFF; + + /* offset != 0 */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x1, 0x0, 0); + EXPECT_EQ(EINVAL, errno); + + /* len != 0 */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0xff, 0); + EXPECT_EQ(EINVAL, errno); + + /* sel != NULL */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x0, &sel); + EXPECT_EQ(EINVAL, errno); + + /* Valid parameter */ + errno = 0; + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x0, 0x0); + EXPECT_EQ(0, errno); + + /* PR_SYS_DISPATCH_ON */ + op = PR_SYS_DISPATCH_ON; + + /* Dispatcher region is bad (offset > 0 && len == 0) */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x1, 0x0, &sel); + EXPECT_EQ(EINVAL, errno); + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, -1L, 0x0, &sel); + EXPECT_EQ(EINVAL, errno); + + /* Invalid selector */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x1, (void *) -1); + ASSERT_EQ(EFAULT, errno); + + /* + * Dispatcher range overflows unsigned long + */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 1, -1L, &sel); + ASSERT_EQ(EINVAL, errno) { + TH_LOG("Should reject bad syscall range"); + } + + /* + * Allowed range overflows usigned long + */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, -1L, 0x1, &sel); + ASSERT_EQ(EINVAL, errno) { + TH_LOG("Should reject bad syscall range"); + } +} + +/* + * Use global selector for handle_sigsys tests, to avoid passing + * selector to signal handler + */ +char glob_sel; +int nr_syscalls_emulated; +int si_code; +int si_errno; + +static void handle_sigsys(int sig, siginfo_t *info, void *ucontext) +{ + si_code = info->si_code; + si_errno = info->si_errno; + + if (info->si_syscall == MAGIC_SYSCALL_1) + nr_syscalls_emulated++; + + /* In preparation for sigreturn. */ + SYSCALL_DISPATCH_OFF(glob_sel); +} + +TEST(dispatch_and_return) +{ + long ret; + struct sigaction act; + sigset_t mask; + + glob_sel = 0; + nr_syscalls_emulated = 0; + si_code = 0; + si_errno = 0; + + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + + act.sa_sigaction = handle_sigsys; + act.sa_flags = SA_SIGINFO; + act.sa_mask = mask; + + ret = sigaction(SIGSYS, &act, NULL); + ASSERT_EQ(0, ret); + + /* Make sure selector is good prior to prctl. */ + SYSCALL_DISPATCH_OFF(glob_sel); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &glob_sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + /* MAGIC_SYSCALL_1 doesn't exist. */ + SYSCALL_DISPATCH_OFF(glob_sel); + ret = syscall(MAGIC_SYSCALL_1); + EXPECT_EQ(-1, ret) { + TH_LOG("Dispatch triggered unexpectedly"); + } + + /* MAGIC_SYSCALL_1 should be emulated. */ + nr_syscalls_emulated = 0; + SYSCALL_DISPATCH_ON(glob_sel); + + ret = syscall(MAGIC_SYSCALL_1); + EXPECT_EQ(MAGIC_SYSCALL_1, ret) { + TH_LOG("Failed to intercept syscall"); + } + EXPECT_EQ(1, nr_syscalls_emulated) { + TH_LOG("Failed to emulate syscall"); + } + ASSERT_EQ(SYS_USER_DISPATCH, si_code) { + TH_LOG("Bad si_code in SIGSYS"); + } + ASSERT_EQ(0, si_errno) { + TH_LOG("Bad si_errno in SIGSYS"); + } +} + +TEST_SIGNAL(bad_selector, SIGSYS) +{ + long ret; + struct sigaction act; + sigset_t mask; + struct sysinfo info; + + glob_sel = 0; + nr_syscalls_emulated = 0; + si_code = 0; + si_errno = 0; + + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + + act.sa_sigaction = handle_sigsys; + act.sa_flags = SA_SIGINFO; + act.sa_mask = mask; + + ret = sigaction(SIGSYS, &act, NULL); + ASSERT_EQ(0, ret); + + /* Make sure selector is good prior to prctl. */ + SYSCALL_DISPATCH_OFF(glob_sel); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &glob_sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + glob_sel = -1; + + sysinfo(&info); + + /* Even though it is ready to catch SIGSYS, the signal is + * supposed to be uncatchable. + */ + + EXPECT_FALSE(true) { + TH_LOG("Unreachable!"); + } +} + +TEST(disable_dispatch) +{ + int ret; + struct sysinfo info; + char sel = 0; + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + /* MAGIC_SYSCALL_1 doesn't exist. */ + SYSCALL_DISPATCH_OFF(glob_sel); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_OFF, 0, 0, 0); + EXPECT_EQ(0, ret) { + TH_LOG("Failed to unset syscall user dispatch"); + } + + /* Shouldn't have any effect... */ + SYSCALL_DISPATCH_ON(glob_sel); + + ret = syscall(__NR_sysinfo, &info); + EXPECT_EQ(0, ret) { + TH_LOG("Dispatch triggered unexpectedly"); + } +} + +TEST(direct_dispatch_range) +{ + int ret = 0; + struct sysinfo info; + char sel = 0; + + /* + * Instead of calculating libc addresses; allow the entire + * memory map and lock the selector. + */ + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, -1L, &sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + SYSCALL_DISPATCH_ON(sel); + + ret = sysinfo(&info); + ASSERT_EQ(0, ret) { + TH_LOG("Dispatch triggered unexpectedly"); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config index c33a7aac27ff..b71828df5a6d 100644 --- a/tools/testing/selftests/tc-testing/config +++ b/tools/testing/selftests/tc-testing/config @@ -59,6 +59,7 @@ CONFIG_NET_IFE_SKBPRIO=m CONFIG_NET_IFE_SKBTCINDEX=m CONFIG_NET_SCH_FIFO=y CONFIG_NET_SCH_ETS=m +CONFIG_NET_SCH_RED=m # ## Network testing diff --git a/tools/testing/selftests/timens/procfs.c b/tools/testing/selftests/timens/procfs.c index 7f14f0fdac84..f2519154208a 100644 --- a/tools/testing/selftests/timens/procfs.c +++ b/tools/testing/selftests/timens/procfs.c @@ -93,6 +93,33 @@ static int read_proc_uptime(struct timespec *uptime) return 0; } +static int read_proc_stat_btime(unsigned long long *boottime_sec) +{ + FILE *proc; + char line_buf[2048]; + + proc = fopen("/proc/stat", "r"); + if (proc == NULL) { + pr_perror("Unable to open /proc/stat"); + return -1; + } + + while (fgets(line_buf, 2048, proc)) { + if (sscanf(line_buf, "btime %llu", boottime_sec) != 1) + continue; + fclose(proc); + return 0; + } + if (errno) { + pr_perror("fscanf"); + fclose(proc); + return -errno; + } + pr_err("failed to parse /proc/stat"); + fclose(proc); + return -1; +} + static int check_uptime(void) { struct timespec uptime_new, uptime_old; @@ -123,18 +150,47 @@ static int check_uptime(void) return 0; } +static int check_stat_btime(void) +{ + unsigned long long btime_new, btime_old; + unsigned long long btime_expected; + + if (switch_ns(parent_ns)) + return pr_err("switch_ns(%d)", parent_ns); + + if (read_proc_stat_btime(&btime_old)) + return 1; + + if (switch_ns(child_ns)) + return pr_err("switch_ns(%d)", child_ns); + + if (read_proc_stat_btime(&btime_new)) + return 1; + + btime_expected = btime_old - TEN_DAYS_IN_SEC; + if (btime_new != btime_expected) { + pr_fail("btime in /proc/stat: old %llu, new %llu [%llu]", + btime_old, btime_new, btime_expected); + return 1; + } + + ksft_test_result_pass("Passed for /proc/stat btime\n"); + return 0; +} + int main(int argc, char *argv[]) { int ret = 0; nscheck(); - ksft_set_plan(1); + ksft_set_plan(2); if (init_namespaces()) return 1; ret |= check_uptime(); + ret |= check_stat_btime(); if (ret) ksft_exit_fail(); diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 0069f2f83f86..d53a4d8008f9 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -5,13 +5,16 @@ uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday $(OUTPUT)/vdso_test_getcpu -ifeq ($(ARCH),x86) +TEST_GEN_PROGS += $(OUTPUT)/vdso_test_abi +TEST_GEN_PROGS += $(OUTPUT)/vdso_test_clock_getres +ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64)) TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86 endif +TEST_GEN_PROGS += $(OUTPUT)/vdso_test_correctness -ifndef CROSS_COMPILE CFLAGS := -std=gnu99 CFLAGS_vdso_standalone_test_x86 := -nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector +LDFLAGS_vdso_test_correctness := -ldl ifeq ($(CONFIG_X86_32),y) LDLIBS += -lgcc_s endif @@ -19,9 +22,14 @@ endif all: $(TEST_GEN_PROGS) $(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c $(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c +$(OUTPUT)/vdso_test_abi: parse_vdso.c vdso_test_abi.c +$(OUTPUT)/vdso_test_clock_getres: vdso_test_clock_getres.c $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c $(CC) $(CFLAGS) $(CFLAGS_vdso_standalone_test_x86) \ vdso_standalone_test_x86.c parse_vdso.c \ -o $@ - -endif +$(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c + $(CC) $(CFLAGS) \ + vdso_test_correctness.c \ + -o $@ \ + $(LDFLAGS_vdso_test_correctness) diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h new file mode 100644 index 000000000000..6a6fe8d4ff55 --- /dev/null +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * vdso_config.h: Configuration options for vDSO tests. + * Copyright (c) 2019 Arm Ltd. + */ +#ifndef __VDSO_CONFIG_H__ +#define __VDSO_CONFIG_H__ + +/* + * Each architecture exports its vDSO implementation with different names + * and a different version from the others, so we need to handle it as a + * special case. + */ +#if defined(__arm__) +#define VDSO_VERSION 0 +#define VDSO_NAMES 1 +#define VDSO_32BIT 1 +#elif defined(__aarch64__) +#define VDSO_VERSION 3 +#define VDSO_NAMES 0 +#elif defined(__powerpc__) +#define VDSO_VERSION 1 +#define VDSO_NAMES 0 +#define VDSO_32BIT 1 +#elif defined(__powerpc64__) +#define VDSO_VERSION 1 +#define VDSO_NAMES 0 +#elif defined (__s390__) +#define VDSO_VERSION 2 +#define VDSO_NAMES 0 +#define VDSO_32BIT 1 +#elif defined (__s390X__) +#define VDSO_VERSION 2 +#define VDSO_NAMES 0 +#elif defined(__mips__) +#define VDSO_VERSION 0 +#define VDSO_NAMES 1 +#define VDSO_32BIT 1 +#elif defined(__sparc__) +#define VDSO_VERSION 0 +#define VDSO_NAMES 1 +#define VDSO_32BIT 1 +#elif defined(__i386__) +#define VDSO_VERSION 0 +#define VDSO_NAMES 1 +#define VDSO_32BIT 1 +#elif defined(__x86_64__) +#define VDSO_VERSION 0 +#define VDSO_NAMES 1 +#elif defined(__riscv__) +#define VDSO_VERSION 5 +#define VDSO_NAMES 1 +#define VDSO_32BIT 1 +#else /* nds32 */ +#define VDSO_VERSION 4 +#define VDSO_NAMES 1 +#define VDSO_32BIT 1 +#endif + +static const char *versions[6] = { + "LINUX_2.6", + "LINUX_2.6.15", + "LINUX_2.6.29", + "LINUX_2.6.39", + "LINUX_4", + "LINUX_4.15", +}; + +static const char *names[2][6] = { + { + "__kernel_gettimeofday", + "__kernel_clock_gettime", + "__kernel_time", + "__kernel_clock_getres", + "__kernel_getcpu", +#if defined(VDSO_32BIT) + "__kernel_clock_gettime64", +#endif + }, + { + "__vdso_gettimeofday", + "__vdso_clock_gettime", + "__vdso_time", + "__vdso_clock_getres", + "__vdso_getcpu", +#if defined(VDSO_32BIT) + "__vdso_clock_gettime64", +#endif + }, +}; + +#endif /* __VDSO_CONFIG_H__ */ diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c new file mode 100644 index 000000000000..3d603f1394af --- /dev/null +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vdso_full_test.c: Sample code to test all the timers. + * Copyright (c) 2019 Arm Ltd. + * + * Compile with: + * gcc -std=gnu99 vdso_full_test.c parse_vdso.c + * + */ + +#include <stdint.h> +#include <elf.h> +#include <stdio.h> +#include <time.h> +#include <sys/auxv.h> +#include <sys/time.h> +#define _GNU_SOURCE +#include <unistd.h> +#include <sys/syscall.h> + +#include "../kselftest.h" +#include "vdso_config.h" + +extern void *vdso_sym(const char *version, const char *name); +extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); +extern void vdso_init_from_auxv(void *auxv); + +static const char *version; +static const char **name; + +typedef long (*vdso_gettimeofday_t)(struct timeval *tv, struct timezone *tz); +typedef long (*vdso_clock_gettime_t)(clockid_t clk_id, struct timespec *ts); +typedef long (*vdso_clock_getres_t)(clockid_t clk_id, struct timespec *ts); +typedef time_t (*vdso_time_t)(time_t *t); + +static int vdso_test_gettimeofday(void) +{ + /* Find gettimeofday. */ + vdso_gettimeofday_t vdso_gettimeofday = + (vdso_gettimeofday_t)vdso_sym(version, name[0]); + + if (!vdso_gettimeofday) { + printf("Could not find %s\n", name[0]); + return KSFT_SKIP; + } + + struct timeval tv; + long ret = vdso_gettimeofday(&tv, 0); + + if (ret == 0) { + printf("The time is %lld.%06lld\n", + (long long)tv.tv_sec, (long long)tv.tv_usec); + } else { + printf("%s failed\n", name[0]); + return KSFT_FAIL; + } + + return KSFT_PASS; +} + +static int vdso_test_clock_gettime(clockid_t clk_id) +{ + /* Find clock_gettime. */ + vdso_clock_gettime_t vdso_clock_gettime = + (vdso_clock_gettime_t)vdso_sym(version, name[1]); + + if (!vdso_clock_gettime) { + printf("Could not find %s\n", name[1]); + return KSFT_SKIP; + } + + struct timespec ts; + long ret = vdso_clock_gettime(clk_id, &ts); + + if (ret == 0) { + printf("The time is %lld.%06lld\n", + (long long)ts.tv_sec, (long long)ts.tv_nsec); + } else { + printf("%s failed\n", name[1]); + return KSFT_FAIL; + } + + return KSFT_PASS; +} + +static int vdso_test_time(void) +{ + /* Find time. */ + vdso_time_t vdso_time = + (vdso_time_t)vdso_sym(version, name[2]); + + if (!vdso_time) { + printf("Could not find %s\n", name[2]); + return KSFT_SKIP; + } + + long ret = vdso_time(NULL); + + if (ret > 0) { + printf("The time in hours since January 1, 1970 is %lld\n", + (long long)(ret / 3600)); + } else { + printf("%s failed\n", name[2]); + return KSFT_FAIL; + } + + return KSFT_PASS; +} + +static int vdso_test_clock_getres(clockid_t clk_id) +{ + /* Find clock_getres. */ + vdso_clock_getres_t vdso_clock_getres = + (vdso_clock_getres_t)vdso_sym(version, name[3]); + + if (!vdso_clock_getres) { + printf("Could not find %s\n", name[3]); + return KSFT_SKIP; + } + + struct timespec ts, sys_ts; + long ret = vdso_clock_getres(clk_id, &ts); + + if (ret == 0) { + printf("The resolution is %lld %lld\n", + (long long)ts.tv_sec, (long long)ts.tv_nsec); + } else { + printf("%s failed\n", name[3]); + return KSFT_FAIL; + } + + ret = syscall(SYS_clock_getres, clk_id, &sys_ts); + + if ((sys_ts.tv_sec != ts.tv_sec) || (sys_ts.tv_nsec != ts.tv_nsec)) { + printf("%s failed\n", name[3]); + return KSFT_FAIL; + } + + return KSFT_PASS; +} + +const char *vdso_clock_name[12] = { + "CLOCK_REALTIME", + "CLOCK_MONOTONIC", + "CLOCK_PROCESS_CPUTIME_ID", + "CLOCK_THREAD_CPUTIME_ID", + "CLOCK_MONOTONIC_RAW", + "CLOCK_REALTIME_COARSE", + "CLOCK_MONOTONIC_COARSE", + "CLOCK_BOOTTIME", + "CLOCK_REALTIME_ALARM", + "CLOCK_BOOTTIME_ALARM", + "CLOCK_SGI_CYCLE", + "CLOCK_TAI", +}; + +/* + * This function calls vdso_test_clock_gettime and vdso_test_clock_getres + * with different values for clock_id. + */ +static inline int vdso_test_clock(clockid_t clock_id) +{ + int ret0, ret1; + + ret0 = vdso_test_clock_gettime(clock_id); + /* A skipped test is considered passed */ + if (ret0 == KSFT_SKIP) + ret0 = KSFT_PASS; + + ret1 = vdso_test_clock_getres(clock_id); + /* A skipped test is considered passed */ + if (ret1 == KSFT_SKIP) + ret1 = KSFT_PASS; + + ret0 += ret1; + + printf("clock_id: %s", vdso_clock_name[clock_id]); + + if (ret0 > 0) + printf(" [FAIL]\n"); + else + printf(" [PASS]\n"); + + return ret0; +} + +int main(int argc, char **argv) +{ + unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); + int ret; + + if (!sysinfo_ehdr) { + printf("AT_SYSINFO_EHDR is not present!\n"); + return KSFT_SKIP; + } + + version = versions[VDSO_VERSION]; + name = (const char **)&names[VDSO_NAMES]; + + printf("[vDSO kselftest] VDSO_VERSION: %s\n", version); + + vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR)); + + ret = vdso_test_gettimeofday(); + +#if _POSIX_TIMERS > 0 + +#ifdef CLOCK_REALTIME + ret += vdso_test_clock(CLOCK_REALTIME); +#endif + +#ifdef CLOCK_BOOTTIME + ret += vdso_test_clock(CLOCK_BOOTTIME); +#endif + +#ifdef CLOCK_TAI + ret += vdso_test_clock(CLOCK_TAI); +#endif + +#ifdef CLOCK_REALTIME_COARSE + ret += vdso_test_clock(CLOCK_REALTIME_COARSE); +#endif + +#ifdef CLOCK_MONOTONIC + ret += vdso_test_clock(CLOCK_MONOTONIC); +#endif + +#ifdef CLOCK_MONOTONIC_RAW + ret += vdso_test_clock(CLOCK_MONOTONIC_RAW); +#endif + +#ifdef CLOCK_MONOTONIC_COARSE + ret += vdso_test_clock(CLOCK_MONOTONIC_COARSE); +#endif + +#endif + + ret += vdso_test_time(); + + if (ret > 0) + return KSFT_FAIL; + + return KSFT_PASS; +} diff --git a/tools/testing/selftests/vDSO/vdso_test_clock_getres.c b/tools/testing/selftests/vDSO/vdso_test_clock_getres.c new file mode 100644 index 000000000000..15dcee16ff72 --- /dev/null +++ b/tools/testing/selftests/vDSO/vdso_test_clock_getres.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +/* + * vdso_clock_getres.c: Sample code to test clock_getres. + * Copyright (c) 2019 Arm Ltd. + * + * Compile with: + * gcc -std=gnu99 vdso_clock_getres.c + * + * Tested on ARM, ARM64, MIPS32, x86 (32-bit and 64-bit), + * Power (32-bit and 64-bit), S390x (32-bit and 64-bit). + * Might work on other architectures. + */ + +#define _GNU_SOURCE +#include <elf.h> +#include <err.h> +#include <fcntl.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <sys/auxv.h> +#include <sys/mman.h> +#include <sys/time.h> +#include <unistd.h> +#include <sys/syscall.h> + +#include "../kselftest.h" + +static long syscall_clock_getres(clockid_t _clkid, struct timespec *_ts) +{ + long ret; + + ret = syscall(SYS_clock_getres, _clkid, _ts); + + return ret; +} + +const char *vdso_clock_name[12] = { + "CLOCK_REALTIME", + "CLOCK_MONOTONIC", + "CLOCK_PROCESS_CPUTIME_ID", + "CLOCK_THREAD_CPUTIME_ID", + "CLOCK_MONOTONIC_RAW", + "CLOCK_REALTIME_COARSE", + "CLOCK_MONOTONIC_COARSE", + "CLOCK_BOOTTIME", + "CLOCK_REALTIME_ALARM", + "CLOCK_BOOTTIME_ALARM", + "CLOCK_SGI_CYCLE", + "CLOCK_TAI", +}; + +/* + * This function calls clock_getres in vdso and by system call + * with different values for clock_id. + * + * Example of output: + * + * clock_id: CLOCK_REALTIME [PASS] + * clock_id: CLOCK_BOOTTIME [PASS] + * clock_id: CLOCK_TAI [PASS] + * clock_id: CLOCK_REALTIME_COARSE [PASS] + * clock_id: CLOCK_MONOTONIC [PASS] + * clock_id: CLOCK_MONOTONIC_RAW [PASS] + * clock_id: CLOCK_MONOTONIC_COARSE [PASS] + */ +static inline int vdso_test_clock(unsigned int clock_id) +{ + struct timespec x, y; + + printf("clock_id: %s", vdso_clock_name[clock_id]); + clock_getres(clock_id, &x); + syscall_clock_getres(clock_id, &y); + + if ((x.tv_sec != y.tv_sec) || (x.tv_nsec != y.tv_nsec)) { + printf(" [FAIL]\n"); + return KSFT_FAIL; + } + + printf(" [PASS]\n"); + return KSFT_PASS; +} + +int main(int argc, char **argv) +{ + int ret; + +#if _POSIX_TIMERS > 0 + +#ifdef CLOCK_REALTIME + ret = vdso_test_clock(CLOCK_REALTIME); +#endif + +#ifdef CLOCK_BOOTTIME + ret += vdso_test_clock(CLOCK_BOOTTIME); +#endif + +#ifdef CLOCK_TAI + ret += vdso_test_clock(CLOCK_TAI); +#endif + +#ifdef CLOCK_REALTIME_COARSE + ret += vdso_test_clock(CLOCK_REALTIME_COARSE); +#endif + +#ifdef CLOCK_MONOTONIC + ret += vdso_test_clock(CLOCK_MONOTONIC); +#endif + +#ifdef CLOCK_MONOTONIC_RAW + ret += vdso_test_clock(CLOCK_MONOTONIC_RAW); +#endif + +#ifdef CLOCK_MONOTONIC_COARSE + ret += vdso_test_clock(CLOCK_MONOTONIC_COARSE); +#endif + +#endif + if (ret > 0) + return KSFT_FAIL; + + return KSFT_PASS; +} diff --git a/tools/testing/selftests/x86/test_vdso.c b/tools/testing/selftests/vDSO/vdso_test_correctness.c index 42052db0f870..5029ef9b228c 100644 --- a/tools/testing/selftests/x86/test_vdso.c +++ b/tools/testing/selftests/vDSO/vdso_test_correctness.c @@ -19,6 +19,10 @@ #include <stdbool.h> #include <limits.h> +#include "vdso_config.h" + +static const char **name; + #ifndef SYS_getcpu # ifdef __x86_64__ # define SYS_getcpu 309 @@ -27,6 +31,17 @@ # endif #endif +#ifndef __NR_clock_gettime64 +#define __NR_clock_gettime64 403 +#endif + +#ifndef __kernel_timespec +struct __kernel_timespec { + long long tv_sec; + long long tv_nsec; +}; +#endif + /* max length of lines in /proc/self/maps - anything longer is skipped here */ #define MAPS_LINE_LEN 128 @@ -36,6 +51,10 @@ typedef int (*vgettime_t)(clockid_t, struct timespec *); vgettime_t vdso_clock_gettime; +typedef int (*vgettime64_t)(clockid_t, struct __kernel_timespec *); + +vgettime64_t vdso_clock_gettime64; + typedef long (*vgtod_t)(struct timeval *tv, struct timezone *tz); vgtod_t vdso_gettimeofday; @@ -99,17 +118,23 @@ static void fill_function_pointers() return; } - vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu"); + vdso_getcpu = (getcpu_t)dlsym(vdso, name[4]); if (!vdso_getcpu) printf("Warning: failed to find getcpu in vDSO\n"); vgetcpu = (getcpu_t) vsyscall_getcpu(); - vdso_clock_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime"); + vdso_clock_gettime = (vgettime_t)dlsym(vdso, name[1]); if (!vdso_clock_gettime) printf("Warning: failed to find clock_gettime in vDSO\n"); - vdso_gettimeofday = (vgtod_t)dlsym(vdso, "__vdso_gettimeofday"); +#if defined(VDSO_32BIT) + vdso_clock_gettime64 = (vgettime64_t)dlsym(vdso, name[5]); + if (!vdso_clock_gettime64) + printf("Warning: failed to find clock_gettime64 in vDSO\n"); +#endif + + vdso_gettimeofday = (vgtod_t)dlsym(vdso, name[0]); if (!vdso_gettimeofday) printf("Warning: failed to find gettimeofday in vDSO\n"); @@ -126,6 +151,11 @@ static inline int sys_clock_gettime(clockid_t id, struct timespec *ts) return syscall(__NR_clock_gettime, id, ts); } +static inline int sys_clock_gettime64(clockid_t id, struct __kernel_timespec *ts) +{ + return syscall(__NR_clock_gettime64, id, ts); +} + static inline int sys_gettimeofday(struct timeval *tv, struct timezone *tz) { return syscall(__NR_gettimeofday, tv, tz); @@ -191,6 +221,15 @@ static bool ts_leq(const struct timespec *a, const struct timespec *b) return a->tv_nsec <= b->tv_nsec; } +static bool ts64_leq(const struct __kernel_timespec *a, + const struct __kernel_timespec *b) +{ + if (a->tv_sec != b->tv_sec) + return a->tv_sec < b->tv_sec; + else + return a->tv_nsec <= b->tv_nsec; +} + static bool tv_leq(const struct timeval *a, const struct timeval *b) { if (a->tv_sec != b->tv_sec) @@ -254,7 +293,10 @@ static void test_one_clock_gettime(int clock, const char *name) if (!ts_leq(&start, &vdso) || !ts_leq(&vdso, &end)) { printf("[FAIL]\tTimes are out of sequence\n"); nerrs++; + return; } + + printf("[OK]\tTest Passed.\n"); } static void test_clock_gettime(void) @@ -275,6 +317,70 @@ static void test_clock_gettime(void) test_one_clock_gettime(INT_MAX, "invalid"); } +static void test_one_clock_gettime64(int clock, const char *name) +{ + struct __kernel_timespec start, vdso, end; + int vdso_ret, end_ret; + + printf("[RUN]\tTesting clock_gettime64 for clock %s (%d)...\n", name, clock); + + if (sys_clock_gettime64(clock, &start) < 0) { + if (errno == EINVAL) { + vdso_ret = vdso_clock_gettime64(clock, &vdso); + if (vdso_ret == -EINVAL) { + printf("[OK]\tNo such clock.\n"); + } else { + printf("[FAIL]\tNo such clock, but __vdso_clock_gettime64 returned %d\n", vdso_ret); + nerrs++; + } + } else { + printf("[WARN]\t clock_gettime64(%d) syscall returned error %d\n", clock, errno); + } + return; + } + + vdso_ret = vdso_clock_gettime64(clock, &vdso); + end_ret = sys_clock_gettime64(clock, &end); + + if (vdso_ret != 0 || end_ret != 0) { + printf("[FAIL]\tvDSO returned %d, syscall errno=%d\n", + vdso_ret, errno); + nerrs++; + return; + } + + printf("\t%llu.%09ld %llu.%09ld %llu.%09ld\n", + (unsigned long long)start.tv_sec, start.tv_nsec, + (unsigned long long)vdso.tv_sec, vdso.tv_nsec, + (unsigned long long)end.tv_sec, end.tv_nsec); + + if (!ts64_leq(&start, &vdso) || !ts64_leq(&vdso, &end)) { + printf("[FAIL]\tTimes are out of sequence\n"); + nerrs++; + return; + } + + printf("[OK]\tTest Passed.\n"); +} + +static void test_clock_gettime64(void) +{ + if (!vdso_clock_gettime64) { + printf("[SKIP]\tNo vDSO, so skipping clock_gettime64() tests\n"); + return; + } + + for (int clock = 0; clock < sizeof(clocknames) / sizeof(clocknames[0]); + clock++) { + test_one_clock_gettime64(clock, clocknames[clock]); + } + + /* Also test some invalid clock ids */ + test_one_clock_gettime64(-1, "invalid"); + test_one_clock_gettime64(INT_MIN, "invalid"); + test_one_clock_gettime64(INT_MAX, "invalid"); +} + static void test_gettimeofday(void) { struct timeval start, vdso, end; @@ -327,9 +433,12 @@ static void test_gettimeofday(void) int main(int argc, char **argv) { + name = (const char **)&names[VDSO_NAMES]; + fill_function_pointers(); test_clock_gettime(); + test_clock_gettime64(); test_gettimeofday(); /* diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 849e8226395a..9a35c3f6a557 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -8,6 +8,7 @@ thuge-gen compaction_test mlock2-tests mremap_dontunmap +mremap_test on-fault-limit transhuge-stress protection_keys @@ -15,8 +16,9 @@ userfaultfd mlock-intersect-test mlock-random-test virtual_address_range -gup_benchmark +gup_test va_128TBswitch map_fixed_noreplace write_to_hugetlbfs hmm-tests +local_config.* diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 30873b19d04b..9a25307f6115 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for vm selftests + +include local_config.mk + uname_M := $(shell uname -m 2>/dev/null || echo not) MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/') @@ -21,23 +24,24 @@ MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/') MAKEFLAGS += --no-builtin-rules CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS) -LDLIBS = -lrt +LDLIBS = -lrt -lpthread TEST_GEN_FILES = compaction_test -TEST_GEN_FILES += gup_benchmark +TEST_GEN_FILES += gup_test TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm -TEST_GEN_FILES += map_hugetlb +TEST_GEN_FILES += khugepaged TEST_GEN_FILES += map_fixed_noreplace +TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_populate TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests TEST_GEN_FILES += mremap_dontunmap +TEST_GEN_FILES += mremap_test TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd -TEST_GEN_FILES += khugepaged ifeq ($(ARCH),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32) @@ -60,24 +64,26 @@ ifeq ($(CAN_BUILD_X86_64),1) TEST_GEN_FILES += $(BINARIES_64) endif else + +ifneq (,$(findstring $(ARCH),powerpc)) TEST_GEN_FILES += protection_keys endif +endif + ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64)) TEST_GEN_FILES += va_128TBswitch TEST_GEN_FILES += virtual_address_range TEST_GEN_FILES += write_to_hugetlbfs endif -TEST_PROGS := run_vmtests +TEST_PROGS := run_vmtests.sh TEST_FILES := test_vmalloc.sh KSFT_KHDR_INSTALL := 1 include ../lib.mk -$(OUTPUT)/hmm-tests: LDLIBS += -lhugetlbfs -lpthread - ifeq ($(ARCH),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) @@ -127,6 +133,25 @@ warn_32bit_failure: endif endif -$(OUTPUT)/userfaultfd: LDLIBS += -lpthread - $(OUTPUT)/mlock-random-test: LDLIBS += -lcap + +$(OUTPUT)/gup_test: ../../../../mm/gup_test.h + +$(OUTPUT)/hmm-tests: local_config.h + +# HMM_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. +$(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS) + +local_config.mk local_config.h: check_config.sh + /bin/sh ./check_config.sh $(CC) + +EXTRA_CLEAN += local_config.mk local_config.h + +ifeq ($(HMM_EXTRA_LIBS),) +all: warn_missing_hugelibs + +warn_missing_hugelibs: + @echo ; \ + echo "Warning: missing libhugetlbfs support. Some HMM tests will be skipped." ; \ + echo +endif diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/vm/check_config.sh new file mode 100644 index 000000000000..079c8a40b85d --- /dev/null +++ b/tools/testing/selftests/vm/check_config.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Probe for libraries and create header files to record the results. Both C +# header files and Makefile include fragments are created. + +OUTPUT_H_FILE=local_config.h +OUTPUT_MKFILE=local_config.mk + +# libhugetlbfs +tmpname=$(mktemp) +tmpfile_c=${tmpname}.c +tmpfile_o=${tmpname}.o + +echo "#include <sys/types.h>" > $tmpfile_c +echo "#include <hugetlbfs.h>" >> $tmpfile_c +echo "int func(void) { return 0; }" >> $tmpfile_c + +CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"} +$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 + +if [ -f $tmpfile_o ]; then + echo "#define LOCAL_CONFIG_HAVE_LIBHUGETLBFS 1" > $OUTPUT_H_FILE + echo "HMM_EXTRA_LIBS = -lhugetlbfs" > $OUTPUT_MKFILE +else + echo "// No libhugetlbfs support found" > $OUTPUT_H_FILE + echo "# No libhugetlbfs support found, so:" > $OUTPUT_MKFILE + echo "HMM_EXTRA_LIBS = " >> $OUTPUT_MKFILE +fi + +rm ${tmpname}.* diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config index 69dd0d1aa30b..60e82da0de85 100644 --- a/tools/testing/selftests/vm/config +++ b/tools/testing/selftests/vm/config @@ -3,4 +3,4 @@ CONFIG_USERFAULTFD=y CONFIG_TEST_VMALLOC=m CONFIG_DEVICE_PRIVATE=y CONFIG_TEST_HMM=m -CONFIG_GUP_BENCHMARK=y +CONFIG_GUP_TEST=y diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c deleted file mode 100644 index 1d4359341e44..000000000000 --- a/tools/testing/selftests/vm/gup_benchmark.c +++ /dev/null @@ -1,143 +0,0 @@ -#include <fcntl.h> -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> - -#include <sys/ioctl.h> -#include <sys/mman.h> -#include <sys/prctl.h> -#include <sys/stat.h> -#include <sys/types.h> - -#include <linux/types.h> - -#define MB (1UL << 20) -#define PAGE_SIZE sysconf(_SC_PAGESIZE) - -#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) -#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark) - -/* Similar to above, but use FOLL_PIN instead of FOLL_GET. */ -#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark) -#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark) -#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark) - -/* Just the flags we need, copied from mm.h: */ -#define FOLL_WRITE 0x01 /* check pte is writable */ - -struct gup_benchmark { - __u64 get_delta_usec; - __u64 put_delta_usec; - __u64 addr; - __u64 size; - __u32 nr_pages_per_call; - __u32 flags; - __u64 expansion[10]; /* For future use */ -}; - -int main(int argc, char **argv) -{ - struct gup_benchmark gup; - unsigned long size = 128 * MB; - int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; - int cmd = GUP_FAST_BENCHMARK, flags = MAP_PRIVATE; - char *file = "/dev/zero"; - char *p; - - while ((opt = getopt(argc, argv, "m:r:n:f:abtTLUuwSH")) != -1) { - switch (opt) { - case 'a': - cmd = PIN_FAST_BENCHMARK; - break; - case 'b': - cmd = PIN_BENCHMARK; - break; - case 'L': - cmd = PIN_LONGTERM_BENCHMARK; - break; - case 'm': - size = atoi(optarg) * MB; - break; - case 'r': - repeats = atoi(optarg); - break; - case 'n': - nr_pages = atoi(optarg); - break; - case 't': - thp = 1; - break; - case 'T': - thp = 0; - break; - case 'U': - cmd = GUP_BENCHMARK; - break; - case 'u': - cmd = GUP_FAST_BENCHMARK; - break; - case 'w': - write = 1; - break; - case 'f': - file = optarg; - break; - case 'S': - flags &= ~MAP_PRIVATE; - flags |= MAP_SHARED; - break; - case 'H': - flags |= (MAP_HUGETLB | MAP_ANONYMOUS); - break; - default: - return -1; - } - } - - filed = open(file, O_RDWR|O_CREAT); - if (filed < 0) { - perror("open"); - exit(filed); - } - - gup.nr_pages_per_call = nr_pages; - if (write) - gup.flags |= FOLL_WRITE; - - fd = open("/sys/kernel/debug/gup_benchmark", O_RDWR); - if (fd == -1) { - perror("open"); - exit(1); - } - - p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); - if (p == MAP_FAILED) { - perror("mmap"); - exit(1); - } - gup.addr = (unsigned long)p; - - if (thp == 1) - madvise(p, size, MADV_HUGEPAGE); - else if (thp == 0) - madvise(p, size, MADV_NOHUGEPAGE); - - for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) - p[0] = 0; - - for (i = 0; i < repeats; i++) { - gup.size = size; - if (ioctl(fd, cmd, &gup)) { - perror("ioctl"); - exit(1); - } - - printf("Time: get:%lld put:%lld us", gup.get_delta_usec, - gup.put_delta_usec); - if (gup.size != size) - printf(", truncated (size: %lld)", gup.size); - printf("\n"); - } - - return 0; -} diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c new file mode 100644 index 000000000000..6c6336dd3b7f --- /dev/null +++ b/tools/testing/selftests/vm/gup_test.c @@ -0,0 +1,194 @@ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include "../../../../mm/gup_test.h" + +#define MB (1UL << 20) +#define PAGE_SIZE sysconf(_SC_PAGESIZE) + +/* Just the flags we need, copied from mm.h: */ +#define FOLL_WRITE 0x01 /* check pte is writable */ + +static char *cmd_to_str(unsigned long cmd) +{ + switch (cmd) { + case GUP_FAST_BENCHMARK: + return "GUP_FAST_BENCHMARK"; + case PIN_FAST_BENCHMARK: + return "PIN_FAST_BENCHMARK"; + case PIN_LONGTERM_BENCHMARK: + return "PIN_LONGTERM_BENCHMARK"; + case GUP_BASIC_TEST: + return "GUP_BASIC_TEST"; + case PIN_BASIC_TEST: + return "PIN_BASIC_TEST"; + case DUMP_USER_PAGES_TEST: + return "DUMP_USER_PAGES_TEST"; + } + return "Unknown command"; +} + +int main(int argc, char **argv) +{ + struct gup_test gup = { 0 }; + unsigned long size = 128 * MB; + int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; + unsigned long cmd = GUP_FAST_BENCHMARK; + int flags = MAP_PRIVATE; + char *file = "/dev/zero"; + char *p; + + while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwSH")) != -1) { + switch (opt) { + case 'a': + cmd = PIN_FAST_BENCHMARK; + break; + case 'b': + cmd = PIN_BASIC_TEST; + break; + case 'L': + cmd = PIN_LONGTERM_BENCHMARK; + break; + case 'c': + cmd = DUMP_USER_PAGES_TEST; + /* + * Dump page 0 (index 1). May be overridden later, by + * user's non-option arguments. + * + * .which_pages is zero-based, so that zero can mean "do + * nothing". + */ + gup.which_pages[0] = 1; + break; + case 'F': + /* strtol, so you can pass flags in hex form */ + gup.flags = strtol(optarg, 0, 0); + break; + case 'm': + size = atoi(optarg) * MB; + break; + case 'r': + repeats = atoi(optarg); + break; + case 'n': + nr_pages = atoi(optarg); + break; + case 't': + thp = 1; + break; + case 'T': + thp = 0; + break; + case 'U': + cmd = GUP_BASIC_TEST; + break; + case 'u': + cmd = GUP_FAST_BENCHMARK; + break; + case 'w': + write = 1; + break; + case 'f': + file = optarg; + break; + case 'S': + flags &= ~MAP_PRIVATE; + flags |= MAP_SHARED; + break; + case 'H': + flags |= (MAP_HUGETLB | MAP_ANONYMOUS); + break; + default: + return -1; + } + } + + if (optind < argc) { + int extra_arg_count = 0; + /* + * For example: + * + * ./gup_test -c 0 1 0x1001 + * + * ...to dump pages 0, 1, and 4097 + */ + + while ((optind < argc) && + (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) { + /* + * Do the 1-based indexing here, so that the user can + * use normal 0-based indexing on the command line. + */ + long page_index = strtol(argv[optind], 0, 0) + 1; + + gup.which_pages[extra_arg_count] = page_index; + extra_arg_count++; + optind++; + } + } + + filed = open(file, O_RDWR|O_CREAT); + if (filed < 0) { + perror("open"); + exit(filed); + } + + gup.nr_pages_per_call = nr_pages; + if (write) + gup.flags |= FOLL_WRITE; + + fd = open("/sys/kernel/debug/gup_test", O_RDWR); + if (fd == -1) { + perror("open"); + exit(1); + } + + p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); + if (p == MAP_FAILED) { + perror("mmap"); + exit(1); + } + gup.addr = (unsigned long)p; + + if (thp == 1) + madvise(p, size, MADV_HUGEPAGE); + else if (thp == 0) + madvise(p, size, MADV_NOHUGEPAGE); + + for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) + p[0] = 0; + + /* Only report timing information on the *_BENCHMARK commands: */ + if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || + (cmd == PIN_LONGTERM_BENCHMARK)) { + for (i = 0; i < repeats; i++) { + gup.size = size; + if (ioctl(fd, cmd, &gup)) + perror("ioctl"), exit(1); + + printf("%s: Time: get:%lld put:%lld us", + cmd_to_str(cmd), gup.get_delta_usec, + gup.put_delta_usec); + if (gup.size != size) + printf(", truncated (size: %lld)", gup.size); + printf("\n"); + } + } else { + gup.size = size; + if (ioctl(fd, cmd, &gup)) { + perror("ioctl"); + exit(1); + } + + printf("%s: done\n", cmd_to_str(cmd)); + if (gup.size != size) + printf("Truncated (size: %lld)\n", gup.size); + } + + return 0; +} diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index c9404ef9698e..5d1ac691b9f4 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -21,12 +21,16 @@ #include <strings.h> #include <time.h> #include <pthread.h> -#include <hugetlbfs.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/mman.h> #include <sys/ioctl.h> +#include "./local_config.h" +#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS +#include <hugetlbfs.h> +#endif + /* * This is a private UAPI to the kernel test module so it isn't exported * in the usual include/uapi/... directory. @@ -662,6 +666,7 @@ TEST_F(hmm, anon_write_huge) hmm_buffer_free(buffer); } +#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS /* * Write huge TLBFS page. */ @@ -720,6 +725,7 @@ TEST_F(hmm, anon_write_hugetlbfs) buffer->ptr = NULL; hmm_buffer_free(buffer); } +#endif /* LOCAL_CONFIG_HAVE_LIBHUGETLBFS */ /* * Read mmap'ed file memory. @@ -1336,6 +1342,7 @@ TEST_F(hmm2, snapshot) hmm_buffer_free(buffer); } +#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS /* * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that * should be mapped by a large page table entry. @@ -1411,6 +1418,7 @@ TEST_F(hmm, compound) buffer->ptr = NULL; hmm_buffer_free(buffer); } +#endif /* LOCAL_CONFIG_HAVE_LIBHUGETLBFS */ /* * Test two devices reading the same memory (double mapped). diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c new file mode 100644 index 000000000000..9c391d016922 --- /dev/null +++ b/tools/testing/selftests/vm/mremap_test.c @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020 Google LLC + */ +#define _GNU_SOURCE + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <time.h> + +#include "../kselftest.h" + +#define EXPECT_SUCCESS 0 +#define EXPECT_FAILURE 1 +#define NON_OVERLAPPING 0 +#define OVERLAPPING 1 +#define NS_PER_SEC 1000000000ULL +#define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */ +#define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) + +struct config { + unsigned long long src_alignment; + unsigned long long dest_alignment; + unsigned long long region_size; + int overlapping; +}; + +struct test { + const char *name; + struct config config; + int expect_failure; +}; + +enum { + _1KB = 1ULL << 10, /* 1KB -> not page aligned */ + _4KB = 4ULL << 10, + _8KB = 8ULL << 10, + _1MB = 1ULL << 20, + _2MB = 2ULL << 20, + _4MB = 4ULL << 20, + _1GB = 1ULL << 30, + _2GB = 2ULL << 30, + PTE = _4KB, + PMD = _2MB, + PUD = _1GB, +}; + +#define MAKE_TEST(source_align, destination_align, size, \ + overlaps, should_fail, test_name) \ +{ \ + .name = test_name, \ + .config = { \ + .src_alignment = source_align, \ + .dest_alignment = destination_align, \ + .region_size = size, \ + .overlapping = overlaps, \ + }, \ + .expect_failure = should_fail \ +} + +/* + * Returns the start address of the mapping on success, else returns + * NULL on failure. + */ +static void *get_source_mapping(struct config c) +{ + unsigned long long addr = 0ULL; + void *src_addr = NULL; +retry: + addr += c.src_alignment; + src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (src_addr == MAP_FAILED) { + if (errno == EPERM) + goto retry; + goto error; + } + /* + * Check that the address is aligned to the specified alignment. + * Addresses which have alignments that are multiples of that + * specified are not considered valid. For instance, 1GB address is + * 2MB-aligned, however it will not be considered valid for a + * requested alignment of 2MB. This is done to reduce coincidental + * alignment in the tests. + */ + if (((unsigned long long) src_addr & (c.src_alignment - 1)) || + !((unsigned long long) src_addr & c.src_alignment)) + goto retry; + + if (!src_addr) + goto error; + + return src_addr; +error: + ksft_print_msg("Failed to map source region: %s\n", + strerror(errno)); + return NULL; +} + +/* Returns the time taken for the remap on success else returns -1. */ +static long long remap_region(struct config c, unsigned int threshold_mb, + char pattern_seed) +{ + void *addr, *src_addr, *dest_addr; + unsigned long long i; + struct timespec t_start = {0, 0}, t_end = {0, 0}; + long long start_ns, end_ns, align_mask, ret, offset; + unsigned long long threshold; + + if (threshold_mb == VALIDATION_NO_THRESHOLD) + threshold = c.region_size; + else + threshold = MIN(threshold_mb * _1MB, c.region_size); + + src_addr = get_source_mapping(c); + if (!src_addr) { + ret = -1; + goto out; + } + + /* Set byte pattern */ + srand(pattern_seed); + for (i = 0; i < threshold; i++) + memset((char *) src_addr + i, (char) rand(), 1); + + /* Mask to zero out lower bits of address for alignment */ + align_mask = ~(c.dest_alignment - 1); + /* Offset of destination address from the end of the source region */ + offset = (c.overlapping) ? -c.dest_alignment : c.dest_alignment; + addr = (void *) (((unsigned long long) src_addr + c.region_size + + offset) & align_mask); + + /* See comment in get_source_mapping() */ + if (!((unsigned long long) addr & c.dest_alignment)) + addr = (void *) ((unsigned long long) addr | c.dest_alignment); + + clock_gettime(CLOCK_MONOTONIC, &t_start); + dest_addr = mremap(src_addr, c.region_size, c.region_size, + MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr); + clock_gettime(CLOCK_MONOTONIC, &t_end); + + if (dest_addr == MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + ret = -1; + goto clean_up_src; + } + + /* Verify byte pattern after remapping */ + srand(pattern_seed); + for (i = 0; i < threshold; i++) { + char c = (char) rand(); + + if (((char *) dest_addr)[i] != c) { + ksft_print_msg("Data after remap doesn't match at offset %d\n", + i); + ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, + ((char *) dest_addr)[i] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + + start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec; + end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec; + ret = end_ns - start_ns; + +/* + * Since the destination address is specified using MREMAP_FIXED, subsequent + * mremap will unmap any previous mapping at the address range specified by + * dest_addr and region_size. This significantly affects the remap time of + * subsequent tests. So we clean up mappings after each test. + */ +clean_up_dest: + munmap(dest_addr, c.region_size); +clean_up_src: + munmap(src_addr, c.region_size); +out: + return ret; +} + +static void run_mremap_test_case(struct test test_case, int *failures, + unsigned int threshold_mb, + unsigned int pattern_seed) +{ + long long remap_time = remap_region(test_case.config, threshold_mb, + pattern_seed); + + if (remap_time < 0) { + if (test_case.expect_failure) + ksft_test_result_pass("%s\n\tExpected mremap failure\n", + test_case.name); + else { + ksft_test_result_fail("%s\n", test_case.name); + *failures += 1; + } + } else { + /* + * Comparing mremap time is only applicable if entire region + * was faulted in. + */ + if (threshold_mb == VALIDATION_NO_THRESHOLD || + test_case.config.region_size <= threshold_mb * _1MB) + ksft_test_result_pass("%s\n\tmremap time: %12lldns\n", + test_case.name, remap_time); + else + ksft_test_result_pass("%s\n", test_case.name); + } +} + +static void usage(const char *cmd) +{ + fprintf(stderr, + "Usage: %s [[-t <threshold_mb>] [-p <pattern_seed>]]\n" + "-t\t only validate threshold_mb of the remapped region\n" + " \t if 0 is supplied no threshold is used; all tests\n" + " \t are run and remapped regions validated fully.\n" + " \t The default threshold used is 4MB.\n" + "-p\t provide a seed to generate the random pattern for\n" + " \t validating the remapped region.\n", cmd); +} + +static int parse_args(int argc, char **argv, unsigned int *threshold_mb, + unsigned int *pattern_seed) +{ + const char *optstr = "t:p:"; + int opt; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + case 't': + *threshold_mb = atoi(optarg); + break; + case 'p': + *pattern_seed = atoi(optarg); + break; + default: + usage(argv[0]); + return -1; + } + } + + if (optind < argc) { + usage(argv[0]); + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + int failures = 0; + int i, run_perf_tests; + unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; + unsigned int pattern_seed; + time_t t; + + pattern_seed = (unsigned int) time(&t); + + if (parse_args(argc, argv, &threshold_mb, &pattern_seed) < 0) + exit(EXIT_FAILURE); + + ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n", + threshold_mb, pattern_seed); + + struct test test_cases[] = { + /* Expected mremap failures */ + MAKE_TEST(_4KB, _4KB, _4KB, OVERLAPPING, EXPECT_FAILURE, + "mremap - Source and Destination Regions Overlapping"), + MAKE_TEST(_4KB, _1KB, _4KB, NON_OVERLAPPING, EXPECT_FAILURE, + "mremap - Destination Address Misaligned (1KB-aligned)"), + MAKE_TEST(_1KB, _4KB, _4KB, NON_OVERLAPPING, EXPECT_FAILURE, + "mremap - Source Address Misaligned (1KB-aligned)"), + + /* Src addr PTE aligned */ + MAKE_TEST(PTE, PTE, _8KB, NON_OVERLAPPING, EXPECT_SUCCESS, + "8KB mremap - Source PTE-aligned, Destination PTE-aligned"), + + /* Src addr 1MB aligned */ + MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2MB mremap - Source 1MB-aligned, Destination PTE-aligned"), + MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned"), + + /* Src addr PMD aligned */ + MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination PTE-aligned"), + MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination 1MB-aligned"), + MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination PMD-aligned"), + + /* Src addr PUD aligned */ + MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PTE-aligned"), + MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination 1MB-aligned"), + MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PMD-aligned"), + MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PUD-aligned"), + }; + + struct test perf_test_cases[] = { + /* + * mremap 1GB region - Page table level aligned time + * comparison. + */ + MAKE_TEST(PTE, PTE, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PTE-aligned, Destination PTE-aligned"), + MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PMD-aligned, Destination PMD-aligned"), + MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PUD-aligned, Destination PUD-aligned"), + }; + + run_perf_tests = (threshold_mb == VALIDATION_NO_THRESHOLD) || + (threshold_mb * _1MB >= _1GB); + + ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? + ARRAY_SIZE(perf_test_cases) : 0)); + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) + run_mremap_test_case(test_cases[i], &failures, threshold_mb, + pattern_seed); + + if (run_perf_tests) { + ksft_print_msg("\n%s\n", + "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); + for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) + run_mremap_test_case(perf_test_cases[i], &failures, + threshold_mb, pattern_seed); + } + + if (failures > 0) + ksft_exit_fail(); + else + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index a3f4f30f0a2e..e953f3cd9664 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -123,10 +123,10 @@ else echo "[PASS]" fi -echo "--------------------------------------------" -echo "running 'gup_benchmark -U' (normal/slow gup)" -echo "--------------------------------------------" -./gup_benchmark -U +echo "------------------------------------------------------" +echo "running: gup_test -u # get_user_pages_fast() benchmark" +echo "------------------------------------------------------" +./gup_test -u if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 @@ -134,10 +134,22 @@ else echo "[PASS]" fi -echo "------------------------------------------" -echo "running gup_benchmark -b (pin_user_pages)" -echo "------------------------------------------" -./gup_benchmark -b +echo "------------------------------------------------------" +echo "running: gup_test -a # pin_user_pages_fast() benchmark" +echo "------------------------------------------------------" +./gup_test -a +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + +echo "------------------------------------------------------------" +echo "# Dump pages 0, 19, and 4096, using pin_user_pages:" +echo "running: gup_test -ct -F 0x1 0 19 0x1000 # dump_page() test" +echo "------------------------------------------------------------" +./gup_test -ct -F 0x1 0 19 0x1000 if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 @@ -148,7 +160,7 @@ fi echo "-------------------" echo "running userfaultfd" echo "-------------------" -./userfaultfd anon 128 32 +./userfaultfd anon 20 16 if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 @@ -173,7 +185,7 @@ rm -f $mnt/ufd_test_file echo "-------------------------" echo "running userfaultfd_shmem" echo "-------------------------" -./userfaultfd shmem 128 32 +./userfaultfd shmem 20 16 if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 @@ -241,6 +253,17 @@ else echo "[PASS]" fi +echo "-------------------" +echo "running mremap_test" +echo "-------------------" +./mremap_test +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + echo "-----------------" echo "running thuge-gen" echo "-----------------" diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 9b0912a01777..92b8ec423201 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -55,6 +55,8 @@ #include <setjmp.h> #include <stdbool.h> #include <assert.h> +#include <inttypes.h> +#include <stdint.h> #include "../kselftest.h" @@ -135,6 +137,13 @@ static void usage(void) exit(1); } +#define uffd_error(code, fmt, ...) \ + do { \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + fprintf(stderr, ": %" PRId64 "\n", (int64_t)(code)); \ + exit(1); \ + } while (0) + static void uffd_stats_reset(struct uffd_stats *uffd_stats, unsigned long n_cpus) { @@ -206,19 +215,19 @@ static int hugetlb_release_pages(char *rel_area) return ret; } - static void hugetlb_allocate_area(void **alloc_area) { void *area_alias = NULL; char **alloc_area_alias; + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, (map_shared ? MAP_SHARED : MAP_PRIVATE) | MAP_HUGETLB, huge_fd, *alloc_area == area_src ? 0 : nr_pages * page_size); if (*alloc_area == MAP_FAILED) { - fprintf(stderr, "mmap of hugetlbfs file failed\n"); - *alloc_area = NULL; + perror("mmap of hugetlbfs file failed"); + goto fail; } if (map_shared) { @@ -227,14 +236,11 @@ static void hugetlb_allocate_area(void **alloc_area) huge_fd, *alloc_area == area_src ? 0 : nr_pages * page_size); if (area_alias == MAP_FAILED) { - if (munmap(*alloc_area, nr_pages * page_size) < 0) { - perror("hugetlb munmap"); - exit(1); - } - *alloc_area = NULL; - return; + perror("mmap of hugetlb file alias failed"); + goto fail_munmap; } } + if (*alloc_area == area_src) { huge_fd_off0 = *alloc_area; alloc_area_alias = &area_src_alias; @@ -243,6 +249,16 @@ static void hugetlb_allocate_area(void **alloc_area) } if (area_alias) *alloc_area_alias = area_alias; + + return; + +fail_munmap: + if (munmap(*alloc_area, nr_pages * page_size) < 0) { + perror("hugetlb munmap"); + exit(1); + } +fail: + *alloc_area = NULL; } static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) @@ -331,7 +347,7 @@ static int my_bcmp(char *str1, char *str2, size_t n) static void wp_range(int ufd, __u64 start, __u64 len, bool wp) { - struct uffdio_writeprotect prms = { 0 }; + struct uffdio_writeprotect prms; /* Write protection page faults */ prms.range.start = start; @@ -340,7 +356,8 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp) prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) { - fprintf(stderr, "clear WP failed for address 0x%Lx\n", start); + fprintf(stderr, "clear WP failed for address 0x%" PRIx64 "\n", + (uint64_t)start); exit(1); } } @@ -474,14 +491,11 @@ static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { /* real retval in ufdio_copy.copy */ if (uffdio_copy->copy != -EEXIST) { - fprintf(stderr, "UFFDIO_COPY retry error %Ld\n", - uffdio_copy->copy); - exit(1); + uffd_error(uffdio_copy->copy, + "UFFDIO_COPY retry error"); } - } else { - fprintf(stderr, "UFFDIO_COPY retry unexpected %Ld\n", - uffdio_copy->copy); exit(1); - } + } else + uffd_error(uffdio_copy->copy, "UFFDIO_COPY retry unexpected"); } static int __copy_page(int ufd, unsigned long offset, bool retry) @@ -502,14 +516,10 @@ static int __copy_page(int ufd, unsigned long offset, bool retry) uffdio_copy.copy = 0; if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { /* real retval in ufdio_copy.copy */ - if (uffdio_copy.copy != -EEXIST) { - fprintf(stderr, "UFFDIO_COPY error %Ld\n", - uffdio_copy.copy); - exit(1); - } + if (uffdio_copy.copy != -EEXIST) + uffd_error(uffdio_copy.copy, "UFFDIO_COPY error"); } else if (uffdio_copy.copy != page_size) { - fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n", - uffdio_copy.copy); exit(1); + uffd_error(uffdio_copy.copy, "UFFDIO_COPY unexpected copy"); } else { if (test_uffdio_copy_eexist && retry) { test_uffdio_copy_eexist = false; @@ -784,11 +794,13 @@ static int userfaultfd_open(int features) uffdio_api.api = UFFD_API; uffdio_api.features = features; if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { - fprintf(stderr, "UFFDIO_API\n"); + fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to " + "run with either root or ptrace capability.\n"); return 1; } if (uffdio_api.api != UFFD_API) { - fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api); + fprintf(stderr, "UFFDIO_API error: %" PRIu64 "\n", + (uint64_t)uffdio_api.api); return 1; } @@ -894,7 +906,7 @@ static int faulting_process(int signal_test) count_verify[nr]); } /* - * Trigger write protection if there is by writting + * Trigger write protection if there is by writing * the same value back. */ *area_count(area_dst, nr) = count; @@ -922,7 +934,7 @@ static int faulting_process(int signal_test) count_verify[nr]); exit(1); } /* - * Trigger write protection if there is by writting + * Trigger write protection if there is by writing * the same value back. */ *area_count(area_dst, nr) = count; @@ -950,13 +962,12 @@ static void retry_uffdio_zeropage(int ufd, offset); if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { if (uffdio_zeropage->zeropage != -EEXIST) { - fprintf(stderr, "UFFDIO_ZEROPAGE retry error %Ld\n", - uffdio_zeropage->zeropage); - exit(1); + uffd_error(uffdio_zeropage->zeropage, + "UFFDIO_ZEROPAGE retry error"); } } else { - fprintf(stderr, "UFFDIO_ZEROPAGE retry unexpected %Ld\n", - uffdio_zeropage->zeropage); exit(1); + uffd_error(uffdio_zeropage->zeropage, + "UFFDIO_ZEROPAGE retry unexpected"); } } @@ -965,6 +976,7 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) struct uffdio_zeropage uffdio_zeropage; int ret; unsigned long has_zeropage; + __s64 res; has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE); @@ -976,29 +988,17 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) uffdio_zeropage.range.len = page_size; uffdio_zeropage.mode = 0; ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); + res = uffdio_zeropage.zeropage; if (ret) { /* real retval in ufdio_zeropage.zeropage */ if (has_zeropage) { - if (uffdio_zeropage.zeropage == -EEXIST) { - fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n"); - exit(1); - } else { - fprintf(stderr, "UFFDIO_ZEROPAGE error %Ld\n", - uffdio_zeropage.zeropage); - exit(1); - } - } else { - if (uffdio_zeropage.zeropage != -EINVAL) { - fprintf(stderr, - "UFFDIO_ZEROPAGE not -EINVAL %Ld\n", - uffdio_zeropage.zeropage); - exit(1); - } - } + uffd_error(res, "UFFDIO_ZEROPAGE %s", + res == -EEXIST ? "-EEXIST" : "error"); + } else if (res != -EINVAL) + uffd_error(res, "UFFDIO_ZEROPAGE not -EINVAL"); } else if (has_zeropage) { - if (uffdio_zeropage.zeropage != page_size) { - fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n", - uffdio_zeropage.zeropage); exit(1); + if (res != page_size) { + uffd_error(res, "UFFDIO_ZEROPAGE unexpected"); } else { if (test_uffdio_zeropage_eexist && retry) { test_uffdio_zeropage_eexist = false; @@ -1007,11 +1007,8 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) } return 1; } - } else { - fprintf(stderr, - "UFFDIO_ZEROPAGE succeeded %Ld\n", - uffdio_zeropage.zeropage); exit(1); - } + } else + uffd_error(res, "UFFDIO_ZEROPAGE succeeded"); return 0; } @@ -1033,7 +1030,7 @@ static int userfaultfd_zeropage_test(void) if (uffd_test_ops->release_pages(area_dst)) return 1; - if (userfaultfd_open(0) < 0) + if (userfaultfd_open(0)) return 1; uffdio_register.range.start = (unsigned long) area_dst; uffdio_register.range.len = nr_pages * page_size; @@ -1083,7 +1080,7 @@ static int userfaultfd_events_test(void) features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE; - if (userfaultfd_open(features) < 0) + if (userfaultfd_open(features)) return 1; fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); @@ -1155,7 +1152,7 @@ static int userfaultfd_sig_test(void) return 1; features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; - if (userfaultfd_open(features) < 0) + if (userfaultfd_open(features)) return 1; fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); @@ -1235,7 +1232,7 @@ static int userfaultfd_stress(void) if (!area_dst) return 1; - if (userfaultfd_open(0) < 0) + if (userfaultfd_open(0)) return 1; count_verify = malloc(nr_pages * sizeof(unsigned long long)); @@ -1295,6 +1292,8 @@ static int userfaultfd_stress(void) printf(" ver"); if (bounces & BOUNCE_POLL) printf(" poll"); + else + printf(" read"); printf(", "); fflush(stdout); diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 6703c7906b71..333980375bc7 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl ioperm \ - test_vdso test_vsyscall mov_ss_trap \ + test_vsyscall mov_ss_trap \ syscall_arg_fault fsgsbase_restore TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 7161cfc2e60b..8c780cce941d 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -392,8 +392,8 @@ static void set_gs_and_switch_to(unsigned long local, local = read_base(GS); /* - * Signal delivery seems to mess up weird selectors. Put it - * back. + * Signal delivery is quite likely to change a selector + * of 1, 2, or 3 back to 0 due to IRET being defective. */ asm volatile ("mov %0, %%gs" : : "rm" (force_sel)); } else { @@ -411,6 +411,14 @@ static void set_gs_and_switch_to(unsigned long local, if (base == local && sel_pre_sched == sel_post_sched) { printf("[OK]\tGS/BASE remained 0x%hx/0x%lx\n", sel_pre_sched, local); + } else if (base == local && sel_pre_sched >= 1 && sel_pre_sched <= 3 && + sel_post_sched == 0) { + /* + * IRET is misdesigned and will squash selectors 1, 2, or 3 + * to zero. Don't fail the test just because this happened. + */ + printf("[OK]\tGS/BASE changed from 0x%hx/0x%lx to 0x%hx/0x%lx because IRET is defective\n", + sel_pre_sched, local, sel_post_sched, base); } else { nerrs++; printf("[FAIL]\tGS/BASE changed from 0x%hx/0x%lx to 0x%hx/0x%lx\n", diff --git a/tools/testing/selftests/x86/raw_syscall_helper_32.S b/tools/testing/selftests/x86/raw_syscall_helper_32.S index 94410fa2b5ed..a10d36afdca0 100644 --- a/tools/testing/selftests/x86/raw_syscall_helper_32.S +++ b/tools/testing/selftests/x86/raw_syscall_helper_32.S @@ -45,3 +45,5 @@ int80_and_ret: .type int80_and_ret, @function .size int80_and_ret, .-int80_and_ret + +.section .note.GNU-stack,"",%progbits diff --git a/tools/testing/selftests/x86/thunks.S b/tools/testing/selftests/x86/thunks.S index 1bb5d62c16a4..a2d47d8344d4 100644 --- a/tools/testing/selftests/x86/thunks.S +++ b/tools/testing/selftests/x86/thunks.S @@ -57,3 +57,5 @@ call32_from_64: ret .size call32_from_64, .-call32_from_64 + +.section .note.GNU-stack,"",%progbits |