From d279f1f8c64711ca986c3121c8ec811b892932f0 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 11 Dec 2017 11:39:03 -0800
Subject: bpf/tracing: add a bpf test for new ioctl query interface

Added a subtest in test_progs. The tracepoint is
sched/sched_switch. Multiple bpf programs are attached to
this tracepoint and the query interface is exercised.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/linux/perf_event.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 362493a2f950..f2c354d5f519 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -418,6 +418,27 @@ struct perf_event_attr {
 	__u16	__reserved_2;	/* align to __u64 */
 };
 
+/*
+ * Structure used by below PERF_EVENT_IOC_QUERY_BPF command
+ * to query bpf programs attached to the same perf tracepoint
+ * as the given perf event.
+ */
+struct perf_event_query_bpf {
+	/*
+	 * The below ids array length
+	 */
+	__u32	ids_len;
+	/*
+	 * Set by the kernel to indicate the number of
+	 * available programs
+	 */
+	__u32	prog_cnt;
+	/*
+	 * User provided buffer to store program ids
+	 */
+	__u32	ids[0];
+};
+
 #define perf_flags(attr)	(*(&(attr)->read_format + 1))
 
 /*
@@ -433,6 +454,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
 #define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
 #define PERF_EVENT_IOC_PAUSE_OUTPUT	_IOW('$', 9, __u32)
+#define PERF_EVENT_IOC_QUERY_BPF	_IOWR('$', 10, struct perf_event_query_bpf *)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
-- 
cgit v1.2.3


From 965de87e54b803223bff703ea6b2a76c056695ae Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Mon, 11 Dec 2017 11:36:49 -0500
Subject: samples/bpf: add a test for bpf_override_return

This adds a basic test for bpf_override_return to verify it works.  We
override the main function for mounting a btrfs fs so it'll return
-ENOMEM and then make sure that trying to mount a btrfs fs will fail.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile                      |  4 ++++
 samples/bpf/test_override_return.sh       | 15 +++++++++++++++
 samples/bpf/tracex7_kern.c                | 16 ++++++++++++++++
 samples/bpf/tracex7_user.c                | 28 ++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h            |  7 ++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++-
 6 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100755 samples/bpf/test_override_return.sh
 create mode 100644 samples/bpf/tracex7_kern.c
 create mode 100644 samples/bpf/tracex7_user.c

(limited to 'tools/include')

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index adeaa1302f34..4fb944a7ecf8 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -12,6 +12,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -58,6 +59,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -101,6 +103,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -155,6 +158,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh
new file mode 100755
index 000000000000..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Checks that tracex7's bpf_override_return makes a btrfs mount fail.
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+# Without a loop device every later step would fail and look like a pass.
+[ -n "$DEVICE" ] || { echo "FAILED!"; exit 1; }
+mkfs.btrfs -f "$DEVICE"
+mkdir -p tmpmnt
+if ./tracex7 "$DEVICE"; then
+	echo "SUCCESS!"
+else
+	echo "FAILED!"
+fi
+losetup -d "$DEVICE"
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index 000000000000..1ab308a43e0f
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	unsigned long rc = -12;	/* -ENOMEM */
+
+	bpf_override_return(ctx, rc);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index 000000000000..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+/* Exit 0 only if "mount argv[1] tmpmnt/" fails with the BPF prog loaded. */
+int main(int argc, char **argv)
+{
+	char filename[256], command[256];
+	FILE *f;
+
+	if (argc < 2)	/* argv[1] is the device to mount */
+		return 1;
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	snprintf(command, sizeof(command), "mount %s tmpmnt/", argv[1]);
+	f = popen(command, "r");
+	if (!f)		/* pclose() on a failed popen() is undefined */
+		return 1;
+	return pclose(f) ? 0 : 1;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4c223ab30293..cf446c25c0ec 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -677,6 +677,10 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ *	@pt_regs: pointer to struct pt_regs
+ *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -736,7 +740,8 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),
+	FN(getsockopt),			\
+	FN(override_return),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index fd9a17fa8a8b..33cb00e46c49 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -82,7 +82,8 @@ static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags,
 static int (*bpf_perf_prog_read_value)(void *ctx, void *buf,
 				       unsigned int buf_size) =
 	(void *) BPF_FUNC_perf_prog_read_value;
-
+static int (*bpf_override_return)(void *ctx, unsigned long rc) =
+	(void *) BPF_FUNC_override_return;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
cgit v1.2.3


From 48cca7e44f9f8268fdcd4351e2f19ff2275119d1 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 14 Dec 2017 17:55:10 -0800
Subject: libbpf: add support for bpf_call

- recognize relocation emitted by llvm
- since all regular functions will be kept in .text section and llvm
  takes care of pc-relative offsets in bpf_call instruction
  simply copy all of .text to relevant program section while adjusting
  bpf_call instructions in program section to point to newly copied
  body of instructions from .text
- do so for all programs in the elf file
- set all programs types to the one passed to bpf_prog_load()

Note: for elf files with multiple programs that use different
functions in .text section, we need to do 'linker' style logic.
This work is still TBD.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h |   6 ++
 tools/lib/bpf/bpf.h            |   2 +-
 tools/lib/bpf/libbpf.c         | 170 ++++++++++++++++++++++++++++++-----------
 3 files changed, 134 insertions(+), 44 deletions(-)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index cf446c25c0ec..db1b0923a308 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -197,8 +197,14 @@ enum bpf_attach_type {
  */
 #define BPF_F_STRICT_ALIGNMENT	(1U << 0)
 
+/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
 #define BPF_PSEUDO_MAP_FD	1
 
+/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
+ * offset to another bpf function
+ */
+#define BPF_PSEUDO_CALL		1
+
 /* flags for BPF_MAP_UPDATE_ELEM command */
 #define BPF_ANY		0 /* create new element or update existing */
 #define BPF_NOEXIST	1 /* create new element if it didn't exist */
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 6534889e2b2f..9f44c196931e 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -40,7 +40,7 @@ int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name,
 			  __u32 map_flags);
 
 /* Recommend log buffer size */
-#define BPF_LOG_BUF_SIZE 65536
+#define BPF_LOG_BUF_SIZE (256 * 1024)
 int bpf_load_program_name(enum bpf_prog_type type, const char *name,
 			  const struct bpf_insn *insns,
 			  size_t insns_cnt, const char *license,
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 65d0d0aff4fa..5b83875b3594 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -174,12 +174,19 @@ struct bpf_program {
 	char *name;
 	char *section_name;
 	struct bpf_insn *insns;
-	size_t insns_cnt;
+	size_t insns_cnt, main_prog_cnt;
 	enum bpf_prog_type type;
 
-	struct {
+	struct reloc_desc {
+		enum {
+			RELO_LD64,
+			RELO_CALL,
+		} type;
 		int insn_idx;
-		int map_idx;
+		union {
+			int map_idx;
+			int text_off;
+		};
 	} *reloc_desc;
 	int nr_reloc;
 
@@ -234,6 +241,7 @@ struct bpf_object {
 		} *reloc;
 		int nr_reloc;
 		int maps_shndx;
+		int text_shndx;
 	} efile;
 	/*
 	 * All loaded bpf_object is linked in a list, which is
@@ -375,9 +383,13 @@ bpf_object__init_prog_names(struct bpf_object *obj)
 	size_t pi, si;
 
 	for (pi = 0; pi < obj->nr_programs; pi++) {
-		char *name = NULL;
+		const char *name = NULL;
 
 		prog = &obj->programs[pi];
+		if (prog->idx == obj->efile.text_shndx) {
+			name = ".text";
+			goto skip_search;
+		}
 
 		for (si = 0; si < symbols->d_size / sizeof(GElf_Sym) && !name;
 		     si++) {
@@ -405,7 +417,7 @@ bpf_object__init_prog_names(struct bpf_object *obj)
 				   prog->section_name);
 			return -EINVAL;
 		}
-
+skip_search:
 		prog->name = strdup(name);
 		if (!prog->name) {
 			pr_warning("failed to allocate memory for prog sym %s\n",
@@ -795,6 +807,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 		} else if ((sh.sh_type == SHT_PROGBITS) &&
 			   (sh.sh_flags & SHF_EXECINSTR) &&
 			   (data->d_size > 0)) {
+			if (strcmp(name, ".text") == 0)
+				obj->efile.text_shndx = idx;
 			err = bpf_object__add_program(obj, data->d_buf,
 						      data->d_size, name, idx);
 			if (err) {
@@ -856,11 +870,14 @@ bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx)
 }
 
 static int
-bpf_program__collect_reloc(struct bpf_program *prog,
-			   size_t nr_maps, GElf_Shdr *shdr,
-			   Elf_Data *data, Elf_Data *symbols,
-			   int maps_shndx, struct bpf_map *maps)
+bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr,
+			   Elf_Data *data, struct bpf_object *obj)
 {
+	Elf_Data *symbols = obj->efile.symbols;
+	int text_shndx = obj->efile.text_shndx;
+	int maps_shndx = obj->efile.maps_shndx;
+	struct bpf_map *maps = obj->maps;
+	size_t nr_maps = obj->nr_maps;
 	int i, nrels;
 
 	pr_debug("collecting relocating info for: '%s'\n",
@@ -893,8 +910,10 @@ bpf_program__collect_reloc(struct bpf_program *prog,
 				   GELF_R_SYM(rel.r_info));
 			return -LIBBPF_ERRNO__FORMAT;
 		}
+		pr_debug("relo for %ld value %ld name %d\n",
+			 rel.r_info >> 32, sym.st_value, sym.st_name);
 
-		if (sym.st_shndx != maps_shndx) {
+		if (sym.st_shndx != maps_shndx && sym.st_shndx != text_shndx) {
 			pr_warning("Program '%s' contains non-map related relo data pointing to section %u\n",
 				   prog->section_name, sym.st_shndx);
 			return -LIBBPF_ERRNO__RELOC;
@@ -903,6 +922,17 @@ bpf_program__collect_reloc(struct bpf_program *prog,
 		insn_idx = rel.r_offset / sizeof(struct bpf_insn);
 		pr_debug("relocation: insn_idx=%u\n", insn_idx);
 
+		if (insns[insn_idx].code == (BPF_JMP | BPF_CALL)) {
+			if (insns[insn_idx].src_reg != BPF_PSEUDO_CALL) {
+				pr_warning("incorrect bpf_call opcode\n");
+				return -LIBBPF_ERRNO__RELOC;
+			}
+			prog->reloc_desc[i].type = RELO_CALL;
+			prog->reloc_desc[i].insn_idx = insn_idx;
+			prog->reloc_desc[i].text_off = sym.st_value;
+			continue;
+		}
+
 		if (insns[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
 			pr_warning("bpf: relocation: invalid relo for insns[%d].code 0x%x\n",
 				   insn_idx, insns[insn_idx].code);
@@ -924,6 +954,7 @@ bpf_program__collect_reloc(struct bpf_program *prog,
 			return -LIBBPF_ERRNO__RELOC;
 		}
 
+		prog->reloc_desc[i].type = RELO_LD64;
 		prog->reloc_desc[i].insn_idx = insn_idx;
 		prog->reloc_desc[i].map_idx = map_idx;
 	}
@@ -962,28 +993,77 @@ bpf_object__create_maps(struct bpf_object *obj)
 	return 0;
 }
 
+/* Append .text to prog once, then rebase one bpf_call relo onto the copy. */
+static int
+bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj,
+			struct reloc_desc *relo)
+{
+	struct bpf_insn *new_insn;
+	struct bpf_program *text;
+	size_t new_cnt;
+
+	if (relo->type != RELO_CALL)
+		return -LIBBPF_ERRNO__RELOC;
+
+	if (prog->idx == obj->efile.text_shndx) {
+		pr_warning("relo in .text insn %d into off %d\n",
+			   relo->insn_idx, relo->text_off);
+		return -LIBBPF_ERRNO__RELOC;
+	}
+
+	if (prog->main_prog_cnt == 0) {	/* .text not appended yet */
+		text = bpf_object__find_prog_by_idx(obj, obj->efile.text_shndx);
+		if (!text) {
+			pr_warning("no .text section found yet relo into text exist\n");
+			return -LIBBPF_ERRNO__RELOC;
+		}
+		new_cnt = prog->insns_cnt + text->insns_cnt;
+		new_insn = realloc(prog->insns, new_cnt * sizeof(*new_insn));
+		if (!new_insn) {
+			pr_warning("oom in prog realloc\n");
+			return -ENOMEM;
+		}
+		memcpy(new_insn + prog->insns_cnt, text->insns,
+		       text->insns_cnt * sizeof(*new_insn));
+		prog->insns = new_insn;
+		prog->main_prog_cnt = prog->insns_cnt;
+		prog->insns_cnt = new_cnt;
+		pr_debug("added %zd insn from %s to prog %s\n", text->insns_cnt,
+			 text->section_name, prog->section_name);
+	}
+	prog->insns[relo->insn_idx].imm += prog->main_prog_cnt - relo->insn_idx;
+	return 0;
+}
+
 static int
 bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj)
 {
-	int i;
+	int i, err;
 
 	if (!prog || !prog->reloc_desc)
 		return 0;
 
 	for (i = 0; i < prog->nr_reloc; i++) {
-		int insn_idx, map_idx;
-		struct bpf_insn *insns = prog->insns;
+		if (prog->reloc_desc[i].type == RELO_LD64) {
+			struct bpf_insn *insns = prog->insns;
+			int insn_idx, map_idx;
 
-		insn_idx = prog->reloc_desc[i].insn_idx;
-		map_idx = prog->reloc_desc[i].map_idx;
+			insn_idx = prog->reloc_desc[i].insn_idx;
+			map_idx = prog->reloc_desc[i].map_idx;
 
-		if (insn_idx >= (int)prog->insns_cnt) {
-			pr_warning("relocation out of range: '%s'\n",
-				   prog->section_name);
-			return -LIBBPF_ERRNO__RELOC;
+			if (insn_idx >= (int)prog->insns_cnt) {
+				pr_warning("relocation out of range: '%s'\n",
+					   prog->section_name);
+				return -LIBBPF_ERRNO__RELOC;
+			}
+			insns[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
+			insns[insn_idx].imm = obj->maps[map_idx].fd;
+		} else {
+			err = bpf_program__reloc_text(prog, obj,
+						      &prog->reloc_desc[i]);
+			if (err)
+				return err;
 		}
-		insns[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
-		insns[insn_idx].imm = obj->maps[map_idx].fd;
 	}
 
 	zfree(&prog->reloc_desc);
@@ -1026,7 +1106,6 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
 		Elf_Data *data = obj->efile.reloc[i].data;
 		int idx = shdr->sh_info;
 		struct bpf_program *prog;
-		size_t nr_maps = obj->nr_maps;
 
 		if (shdr->sh_type != SHT_REL) {
 			pr_warning("internal error at %d\n", __LINE__);
@@ -1040,11 +1119,9 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
 			return -LIBBPF_ERRNO__RELOC;
 		}
 
-		err = bpf_program__collect_reloc(prog, nr_maps,
+		err = bpf_program__collect_reloc(prog,
 						 shdr, data,
-						 obj->efile.symbols,
-						 obj->efile.maps_shndx,
-						 obj->maps);
+						 obj);
 		if (err)
 			return err;
 	}
@@ -1197,6 +1274,8 @@ bpf_object__load_progs(struct bpf_object *obj)
 	int err;
 
 	for (i = 0; i < obj->nr_programs; i++) {
+		if (obj->programs[i].idx == obj->efile.text_shndx)
+			continue;
 		err = bpf_program__load(&obj->programs[i],
 					obj->license,
 					obj->kern_version);
@@ -1859,7 +1938,7 @@ long libbpf_get_error(const void *ptr)
 int bpf_prog_load(const char *file, enum bpf_prog_type type,
 		  struct bpf_object **pobj, int *prog_fd)
 {
-	struct bpf_program *prog;
+	struct bpf_program *prog, *first_prog = NULL;
 	struct bpf_object *obj;
 	int err;
 
@@ -1867,25 +1946,30 @@ int bpf_prog_load(const char *file, enum bpf_prog_type type,
 	if (IS_ERR(obj))
 		return -ENOENT;
 
-	prog = bpf_program__next(NULL, obj);
-	if (!prog) {
-		bpf_object__close(obj);
-		return -ENOENT;
-	}
-
-	/*
-	 * If type is not specified, try to guess it based on
-	 * section name.
-	 */
-	if (type == BPF_PROG_TYPE_UNSPEC) {
-		type = bpf_program__guess_type(prog);
+	bpf_object__for_each_program(prog, obj) {
+		/*
+		 * If type is not specified, try to guess it based on
+		 * section name.
+		 */
 		if (type == BPF_PROG_TYPE_UNSPEC) {
-			bpf_object__close(obj);
-			return -EINVAL;
+			type = bpf_program__guess_type(prog);
+			if (type == BPF_PROG_TYPE_UNSPEC) {
+				bpf_object__close(obj);
+				return -EINVAL;
+			}
 		}
+
+		bpf_program__set_type(prog, type);
+		if (prog->idx != obj->efile.text_shndx && !first_prog)
+			first_prog = prog;
+	}
+
+	if (!first_prog) {
+		pr_warning("object file doesn't contain bpf program\n");
+		bpf_object__close(obj);
+		return -ENOENT;
 	}
 
-	bpf_program__set_type(prog, type);
 	err = bpf_object__load(obj);
 	if (err) {
 		bpf_object__close(obj);
@@ -1893,6 +1977,6 @@ int bpf_prog_load(const char *file, enum bpf_prog_type type,
 	}
 
 	*pobj = obj;
-	*prog_fd = bpf_program__fd(prog);
+	*prog_fd = bpf_program__fd(first_prog);
 	return 0;
 }
-- 
cgit v1.2.3


From 675fc275a3a2d905535207237402c6d8dcb5fa4b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Dec 2017 18:39:09 -0800
Subject: bpf: offload: report device information for offloaded programs

Report to the user ifindex and namespace information of offloaded
programs.  If device has disappeared return -ENODEV.  Specify the
namespace using dev/inode combination.

CC: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h            |  2 ++
 include/uapi/linux/bpf.h       |  3 +++
 kernel/bpf/offload.c           | 59 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  6 +++++
 tools/include/uapi/linux/bpf.h |  3 +++
 5 files changed, 73 insertions(+)

(limited to 'tools/include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9a916ab34299..7810ae57b357 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -531,6 +531,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
+int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
+			       struct bpf_prog *prog);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 69eabfcb9bdb..f2f8b36e2ad4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -921,6 +921,9 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index e4f1668a021c..040d4e0edf3f 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -16,9 +16,11 @@
 #include <linux/bpf.h>
 #include <linux/bpf_verifier.h>
 #include <linux/bug.h>
+#include <linux/kdev_t.h>
 #include <linux/list.h>
 #include <linux/netdevice.h>
 #include <linux/printk.h>
+#include <linux/proc_ns.h>
 #include <linux/rtnetlink.h>
 #include <linux/rwsem.h>
 
@@ -176,6 +178,63 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
 	return bpf_prog_offload_translate(prog);
 }
 
+struct ns_get_path_bpf_prog_args {
+	struct bpf_prog *prog;
+	struct bpf_prog_info *info;
+};
+
+static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data)
+{
+	struct ns_get_path_bpf_prog_args *args = private_data;
+	struct bpf_prog_aux *aux = args->prog->aux;
+	struct ns_common *ns;
+	struct net *net;
+
+	rtnl_lock();	/* RTNL must be taken before bpf_devs_lock */
+	down_read(&bpf_devs_lock);
+
+	if (aux->offload) {
+		args->info->ifindex = aux->offload->netdev->ifindex;
+		net = dev_net(aux->offload->netdev);
+		get_net(net);
+		ns = &net->ns;
+	} else {
+		args->info->ifindex = 0;	/* lets the caller pick -ENODEV */
+		ns = NULL;
+	}
+
+	up_read(&bpf_devs_lock);
+	rtnl_unlock();
+
+	return ns;
+}
+
+int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
+			       struct bpf_prog *prog)
+{
+	struct ns_get_path_bpf_prog_args args = {
+		.prog	= prog,
+		.info	= info,
+	};
+	struct inode *ns_inode;
+	struct path ns_path;
+	void *res;
+
+	res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args);
+	if (IS_ERR(res)) {
+		if (!info->ifindex)	/* 0 is what the callback stores without a device */
+			return -ENODEV;
+		return PTR_ERR(res);
+	}
+
+	ns_inode = ns_path.dentry->d_inode;
+	info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+	info->netns_ino = ns_inode->i_ino;
+	path_put(&ns_path);
+
+	return 0;
+}
+
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e02dafa6f402..ebf0fb23e237 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1707,6 +1707,12 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 			return -EFAULT;
 	}
 
+	if (bpf_prog_is_dev_bound(prog->aux)) {
+		err = bpf_prog_offload_info_fill(&info, prog);
+		if (err)
+			return err;
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index db1b0923a308..4e8c60acfa32 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -921,6 +921,9 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
cgit v1.2.3


From a38845729ea3985db5d2544ec3ef3dc8f6313a27 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 11 Jan 2018 20:29:09 -0800
Subject: bpf: offload: add map offload infrastructure

BPF map offload follows a similar path to program offload.  At creation
time users may specify ifindex of the device on which they want to
create the map.  Map will be validated by the kernel's
.map_alloc_check callback and device driver will be called for the
actual allocation.  Map will have an empty set of operations
associated with it (save for alloc and free callbacks).  The real
device callbacks are kept in map->offload->dev_ops because they
have slightly different signatures.  Map operations are called in
process context so the driver may communicate with HW freely,
msleep(), wait() etc.

Map alloc and free callbacks are muxed via existing .ndo_bpf, and
are always called with rtnl lock held.  Maps and programs are
guaranteed to be destroyed before .ndo_uninit (i.e. before
unregister_netdev() returns).  Map callbacks are invoked with
bpf_devs_lock *read* locked, drivers must take care of exclusive
locking if necessary.

All offload-specific branches are marked with unlikely() (through
bpf_map_is_dev_bound()), given that branch penalty will be
negligible compared to IO anyway, and we don't want to penalize
SW path unnecessarily.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h            |  59 +++++++++++++
 include/linux/netdevice.h      |   6 ++
 include/uapi/linux/bpf.h       |   1 +
 kernel/bpf/offload.c           | 188 +++++++++++++++++++++++++++++++++++++++--
 kernel/bpf/syscall.c           |  44 ++++++++--
 kernel/bpf/verifier.c          |   7 ++
 tools/include/uapi/linux/bpf.h |   1 +
 7 files changed, 293 insertions(+), 13 deletions(-)

(limited to 'tools/include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9fff1ace1d8e..5c2c104dc2c5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -74,6 +74,33 @@ struct bpf_map {
 	char name[BPF_OBJ_NAME_LEN];
 };
 
+struct bpf_offloaded_map;
+
+struct bpf_map_dev_ops {
+	int (*map_get_next_key)(struct bpf_offloaded_map *map,
+				void *key, void *next_key);
+	int (*map_lookup_elem)(struct bpf_offloaded_map *map,
+			       void *key, void *value);
+	int (*map_update_elem)(struct bpf_offloaded_map *map,
+			       void *key, void *value, u64 flags);
+	int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key);
+};
+
+struct bpf_offloaded_map {
+	struct bpf_map map;
+	struct net_device *netdev;
+	const struct bpf_map_dev_ops *dev_ops;
+	void *dev_priv;
+	struct list_head offloads;
+};
+
+static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map)
+{
+	return container_of(map, struct bpf_offloaded_map, map);
+}
+
+extern const struct bpf_map_ops bpf_map_offload_ops;
+
 /* function argument constraints */
 enum bpf_arg_type {
 	ARG_DONTCARE = 0,	/* unused argument in helper function */
@@ -369,6 +396,7 @@ int __bpf_prog_charge(struct user_struct *user, u32 pages);
 void __bpf_prog_uncharge(struct user_struct *user, u32 pages);
 
 void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock);
+void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
@@ -556,6 +584,15 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog);
 int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 			       struct bpf_prog *prog);
 
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value);
+int bpf_map_offload_update_elem(struct bpf_map *map,
+				void *key, void *value, u64 flags);
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key);
+int bpf_map_offload_get_next_key(struct bpf_map *map,
+				 void *key, void *next_key);
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map);
+
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
 
@@ -563,6 +600,14 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
 {
 	return aux->offload_requested;
 }
+
+static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
+{
+	return unlikely(map->ops == &bpf_map_offload_ops);
+}
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr);
+void bpf_map_offload_map_free(struct bpf_map *map);
 #else
 static inline int bpf_prog_offload_init(struct bpf_prog *prog,
 					union bpf_attr *attr)
@@ -574,6 +619,20 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
 {
 	return false;
 }
+
+static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
+{
+	return false;
+}
+
+static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void bpf_map_offload_map_free(struct bpf_map *map)
+{
+}
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ef7b348e8498..0b3ab42d50fe 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -804,6 +804,8 @@ enum bpf_netdev_command {
 	BPF_OFFLOAD_VERIFIER_PREP,
 	BPF_OFFLOAD_TRANSLATE,
 	BPF_OFFLOAD_DESTROY,
+	BPF_OFFLOAD_MAP_ALLOC,
+	BPF_OFFLOAD_MAP_FREE,
 };
 
 struct bpf_prog_offload_ops;
@@ -834,6 +836,10 @@ struct netdev_bpf {
 		struct {
 			struct bpf_prog *prog;
 		} offload;
+		/* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */
+		struct {
+			struct bpf_offloaded_map *offmap;
+		};
 	};
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 395d261948de..7c2259e8bc54 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -245,6 +245,7 @@ union bpf_attr {
 					 * BPF_F_NUMA_NODE is set).
 					 */
 		char	map_name[BPF_OBJ_NAME_LEN];
+		__u32	map_ifindex;	/* ifindex of netdev to create on */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index cdd1e19a668b..453785fa1881 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -24,11 +24,13 @@
 #include <linux/rtnetlink.h>
 #include <linux/rwsem.h>
 
-/* Protects bpf_prog_offload_devs and offload members of all progs.
+/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members
+ * of all progs.
  * RTNL lock cannot be taken when holding this lock.
  */
 static DECLARE_RWSEM(bpf_devs_lock);
 static LIST_HEAD(bpf_prog_offload_devs);
+static LIST_HEAD(bpf_map_offload_devs);
 
 static int bpf_dev_offload_check(struct net_device *netdev)
 {
@@ -250,11 +252,186 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+			       enum bpf_netdev_command cmd)
+{
+	struct netdev_bpf data = {};
+	struct net_device *netdev;
+
+	ASSERT_RTNL();
+
+	data.command = cmd;
+	data.offmap = offmap;
+	/* Caller must make sure netdev is valid */
+	netdev = offmap->netdev;
+
+	return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_offloaded_map *offmap;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+	if (attr->map_type != BPF_MAP_TYPE_HASH)
+		return ERR_PTR(-EINVAL);
+
+	offmap = kzalloc(sizeof(*offmap), GFP_USER);
+	if (!offmap)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&offmap->map, attr);
+
+	rtnl_lock();
+	down_write(&bpf_devs_lock);
+	offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
+	err = bpf_dev_offload_check(offmap->netdev);
+	if (err)
+		goto err_unlock;
+
+	err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
+	if (err)
+		goto err_unlock;
+
+	list_add_tail(&offmap->offloads, &bpf_map_offload_devs);
+	up_write(&bpf_devs_lock);
+	rtnl_unlock();
+
+	return &offmap->map;
+
+err_unlock:
+	up_write(&bpf_devs_lock);
+	rtnl_unlock();
+	kfree(offmap);
+	return ERR_PTR(err);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+	WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+	/* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+	bpf_map_free_id(&offmap->map, true);
+	list_del_init(&offmap->offloads);
+	offmap->netdev = NULL;
+}
+
+void bpf_map_offload_map_free(struct bpf_map *map)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+
+	rtnl_lock();
+	down_write(&bpf_devs_lock);
+	if (offmap->netdev)
+		__bpf_map_offload_destroy(offmap);
+	up_write(&bpf_devs_lock);
+	rtnl_unlock();
+
+	kfree(offmap);
+}
+
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_lookup_elem(offmap, key, value);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+int bpf_map_offload_update_elem(struct bpf_map *map,
+				void *key, void *value, u64 flags)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_update_elem(offmap, key, value,
+						       flags);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_delete_elem(offmap, key);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
+{
+	struct bpf_offloaded_map *offmap;
+	struct bpf_prog_offload *offload;
+	bool ret;
+
+	if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map))
+		return false;
+	if (!bpf_prog_is_dev_bound(prog->aux))
+		return true;
+
+	down_read(&bpf_devs_lock);
+	offload = prog->aux->offload;
+	offmap = map_to_offmap(map);
+
+	ret = offload && offload->netdev == offmap->netdev;
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+static void bpf_offload_orphan_all_progs(struct net_device *netdev)
+{
+	struct bpf_prog_offload *offload, *tmp;
+
+	list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads)
+		if (offload->netdev == netdev)
+			__bpf_prog_offload_destroy(offload->prog);
+}
+
+static void bpf_offload_orphan_all_maps(struct net_device *netdev)
+{
+	struct bpf_offloaded_map *offmap, *tmp;
+
+	list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads)
+		if (offmap->netdev == netdev)
+			__bpf_map_offload_destroy(offmap);
+}
+
 static int bpf_offload_notification(struct notifier_block *notifier,
 				    ulong event, void *ptr)
 {
 	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
-	struct bpf_prog_offload *offload, *tmp;
 
 	ASSERT_RTNL();
 
@@ -265,11 +442,8 @@ static int bpf_offload_notification(struct notifier_block *notifier,
 			break;
 
 		down_write(&bpf_devs_lock);
-		list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
-					 offloads) {
-			if (offload->netdev == netdev)
-				__bpf_prog_offload_destroy(offload->prog);
-		}
+		bpf_offload_orphan_all_progs(netdev);
+		bpf_offload_orphan_all_maps(netdev);
 		up_write(&bpf_devs_lock);
 		break;
 	default:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a3f726bb42ea..c691b9e972e3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -94,6 +94,11 @@ static int check_uarg_tail_zero(void __user *uaddr,
 	return 0;
 }
 
+const struct bpf_map_ops bpf_map_offload_ops = {
+	.map_alloc = bpf_map_offload_map_alloc,
+	.map_free = bpf_map_offload_map_free,
+};
+
 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 {
 	const struct bpf_map_ops *ops;
@@ -111,6 +116,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 		if (err)
 			return ERR_PTR(err);
 	}
+	if (attr->map_ifindex)
+		ops = &bpf_map_offload_ops;
 	map = ops->map_alloc(attr);
 	if (IS_ERR(map))
 		return map;
@@ -208,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map)
 	return id > 0 ? 0 : id;
 }
 
-static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 {
 	unsigned long flags;
 
+	/* Offloaded maps are removed from the IDR store when their device
+	 * disappears - even if someone holds an fd to them they are unusable,
+	 * the memory is gone, all ops will fail; they are simply waiting for
+	 * refcnt to drop to be freed.
+	 */
+	if (!map->id)
+		return;
+
 	if (do_idr_lock)
 		spin_lock_irqsave(&map_idr_lock, flags);
 	else
 		__acquire(&map_idr_lock);
 
 	idr_remove(&map_idr, map->id);
+	map->id = 0;
 
 	if (do_idr_lock)
 		spin_unlock_irqrestore(&map_idr_lock, flags);
@@ -397,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD map_name
+#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -585,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (!value)
 		goto free_key;
 
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
@@ -673,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr)
 		goto free_value;
 
 	/* Need to create a kthread, thus must support schedule */
-	if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_update_elem(map, key, value, attr->flags);
+		goto out;
+	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
 		goto out;
 	}
@@ -750,6 +771,11 @@ static int map_delete_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_delete_elem(map, key);
+		goto out;
+	}
+
 	preempt_disable();
 	__this_cpu_inc(bpf_prog_active);
 	rcu_read_lock();
@@ -757,7 +783,7 @@ static int map_delete_elem(union bpf_attr *attr)
 	rcu_read_unlock();
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
-
+out:
 	if (!err)
 		trace_bpf_map_delete_elem(map, ufd, key);
 	kfree(key);
@@ -807,9 +833,15 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (!next_key)
 		goto free_key;
 
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_get_next_key(map, key, next_key);
+		goto out;
+	}
+
 	rcu_read_lock();
 	err = map->ops->map_get_next_key(map, key, next_key);
 	rcu_read_unlock();
+out:
 	if (err)
 		goto free_next_key;
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 48b61caa94cb..ceabb394d2dc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4816,6 +4816,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 			return -EINVAL;
 		}
 	}
+
+	if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+	    !bpf_offload_dev_match(prog, map)) {
+		verbose(env, "offload device mismatch between prog and map\n");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4e8c60acfa32..69f96af4a569 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -245,6 +245,7 @@ union bpf_attr {
 					 * BPF_F_NUMA_NODE is set).
 					 */
 		char	map_name[BPF_OBJ_NAME_LEN];
+		__u32	map_ifindex;	/* ifindex of netdev to create on */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
-- 
cgit v1.2.3


From e7b2823a582a5bca5ee47644f448e317178e8824 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 18 Jan 2018 17:49:08 +0100
Subject: bpf: Sync kernel ABI header with tooling header

Update tools/include/uapi/linux/bpf.h to bring it in sync with
include/uapi/linux/bpf.h.  The listed commits forgot to update it.

Fixes: 02dd3291b2f0 ("bpf: finally expose xdp_rxq_info to XDP bpf-programs")
Fixes: f19397a5c656 ("bpf: Add access to snd_cwnd and others in sock_ops")
Fixes: 06ef0ccb5a36 ("bpf/cgroup: fix a verification error for a CGROUP_DEVICE type prog")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 69f96af4a569..7c2259e8bc54 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -900,6 +900,9 @@ struct xdp_md {
 	__u32 data;
 	__u32 data_end;
 	__u32 data_meta;
+	/* Below access go through struct xdp_rxq_info */
+	__u32 ingress_ifindex; /* rxq->dev->ifindex */
+	__u32 rx_queue_index;  /* rxq->queue_index  */
 };
 
 enum sk_action {
@@ -956,6 +959,12 @@ struct bpf_sock_ops {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	__u32 is_fullsock;	/* Some TCP fields are only valid if
+				 * there is a full socket. If not, the
+				 * fields read as zero.
+				 */
+	__u32 snd_cwnd;
+	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
 };
 
 /* List of known BPF sock_ops operators.
@@ -1010,7 +1019,8 @@ struct bpf_perf_event_value {
 #define BPF_DEVCG_DEV_CHAR	(1ULL << 1)
 
 struct bpf_cgroup_dev_ctx {
-	__u32 access_type; /* (access << 16) | type */
+	/* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */
+	__u32 access_type;
 	__u32 major;
 	__u32 minor;
 };
-- 
cgit v1.2.3


From 52775b33bb5072fbc07b02c0cf4fe8da1f7ee7cd Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 17 Jan 2018 19:13:28 -0800
Subject: bpf: offload: report device information about offloaded maps

Tell user space about the device on which the map was created.
The unfortunate reality of the user ABI makes sharing this code
with program offload difficult, but the information is the
same.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h            |  2 ++
 include/uapi/linux/bpf.h       |  3 +++
 kernel/bpf/offload.c           | 55 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  6 +++++
 tools/include/uapi/linux/bpf.h |  3 +++
 5 files changed, 69 insertions(+)

(limited to 'tools/include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 025b1c2f8053..66df387106de 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -586,6 +586,8 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog);
 int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 			       struct bpf_prog *prog);
 
+int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map);
+
 int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value);
 int bpf_map_offload_update_elem(struct bpf_map *map,
 				void *key, void *value, u64 flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 74dc4dc98681..406c19d6016b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -938,6 +938,9 @@ struct bpf_map_info {
 	__u32 max_entries;
 	__u32 map_flags;
 	char  name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 2657976aec2a..c9401075b58c 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -413,6 +413,61 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return ret;
 }
 
+struct ns_get_path_bpf_map_args {
+	struct bpf_offloaded_map *offmap;
+	struct bpf_map_info *info;
+};
+
+static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data)
+{
+	struct ns_get_path_bpf_map_args *args = private_data;
+	struct ns_common *ns;
+	struct net *net;
+
+	rtnl_lock();
+	down_read(&bpf_devs_lock);
+
+	if (args->offmap->netdev) {
+		args->info->ifindex = args->offmap->netdev->ifindex;
+		net = dev_net(args->offmap->netdev);
+		get_net(net);
+		ns = &net->ns;
+	} else {
+		args->info->ifindex = 0;
+		ns = NULL;
+	}
+
+	up_read(&bpf_devs_lock);
+	rtnl_unlock();
+
+	return ns;
+}
+
+int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map)
+{
+	struct ns_get_path_bpf_map_args args = {
+		.offmap	= map_to_offmap(map),
+		.info	= info,
+	};
+	struct inode *ns_inode;
+	struct path ns_path;
+	void *res;
+
+	res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args);
+	if (IS_ERR(res)) {
+		if (!info->ifindex)
+			return -ENODEV;
+		return PTR_ERR(res);
+	}
+
+	ns_inode = ns_path.dentry->d_inode;
+	info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+	info->netns_ino = ns_inode->i_ino;
+	path_put(&ns_path);
+
+	return 0;
+}
+
 bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
 {
 	struct bpf_offloaded_map *offmap;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 97a825ffc763..5bdb0cc84ad2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1801,6 +1801,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	info.map_flags = map->map_flags;
 	memcpy(info.name, map->name, sizeof(map->name));
 
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_info_fill(&info, map);
+		if (err)
+			return err;
+	}
+
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
 		return -EFAULT;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7c2259e8bc54..af1f49ad8b88 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -938,6 +938,9 @@ struct bpf_map_info {
 	__u32 max_entries;
 	__u32 map_flags;
 	char  name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
-- 
cgit v1.2.3


From d6d4f60c3a0933852dcc40a2142d93027ea1da76 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:16 -0800
Subject: bpf: add selftest for tcpbpf

Added a selftest for tcpbpf (sock_ops) that checks that the appropriate
callbacks occurred and that it can access tcp_sock fields and that their
values are correct.

Run with command: ./test_tcpbpf_user
Adding the flag "-d" will show why it did not pass.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/linux/bpf.h                 |  86 ++++++++++++++++-
 tools/testing/selftests/bpf/Makefile           |   4 +-
 tools/testing/selftests/bpf/bpf_helpers.h      |   2 +
 tools/testing/selftests/bpf/tcp_client.py      |  51 ++++++++++
 tools/testing/selftests/bpf/tcp_server.py      |  83 ++++++++++++++++
 tools/testing/selftests/bpf/test_tcpbpf.h      |  16 ++++
 tools/testing/selftests/bpf/test_tcpbpf_kern.c | 118 +++++++++++++++++++++++
 tools/testing/selftests/bpf/test_tcpbpf_user.c | 126 +++++++++++++++++++++++++
 8 files changed, 480 insertions(+), 6 deletions(-)
 create mode 100755 tools/testing/selftests/bpf/tcp_client.py
 create mode 100755 tools/testing/selftests/bpf/tcp_server.py
 create mode 100644 tools/testing/selftests/bpf/test_tcpbpf.h
 create mode 100644 tools/testing/selftests/bpf/test_tcpbpf_kern.c
 create mode 100644 tools/testing/selftests/bpf/test_tcpbpf_user.c

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index af1f49ad8b88..db6bdc375126 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -17,7 +17,7 @@
 #define BPF_ALU64	0x07	/* alu mode in double word width */
 
 /* ld/ldx fields */
-#define BPF_DW		0x18	/* double word */
+#define BPF_DW		0x18	/* double word (64-bit) */
 #define BPF_XADD	0xc0	/* exclusive add */
 
 /* alu/jmp fields */
@@ -642,6 +642,14 @@ union bpf_attr {
  *     @optlen: length of optval in bytes
  *     Return: 0 or negative error
  *
+ * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags)
+ *     Set callback flags for sock_ops
+ *     @bpf_sock_ops: pointer to bpf_sock_ops_kern struct
+ *     @flags: flags value
+ *     Return: 0 for no error
+ *             -EINVAL if there is no full tcp socket
+ *             bits in flags that are not supported by current kernel
+ *
  * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
  *     Grow or shrink room in sk_buff.
  *     @skb: pointer to skb
@@ -748,7 +756,8 @@ union bpf_attr {
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
 	FN(getsockopt),			\
-	FN(override_return),
+	FN(override_return),		\
+	FN(sock_ops_cb_flags_set),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -952,8 +961,9 @@ struct bpf_map_info {
 struct bpf_sock_ops {
 	__u32 op;
 	union {
-		__u32 reply;
-		__u32 replylong[4];
+		__u32 args[4];		/* Optionally passed to bpf program */
+		__u32 reply;		/* Returned by bpf program	    */
+		__u32 replylong[4];	/* Optionally returned by bpf prog  */
 	};
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
@@ -968,8 +978,39 @@ struct bpf_sock_ops {
 				 */
 	__u32 snd_cwnd;
 	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
+	__u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */
+	__u32 state;
+	__u32 rtt_min;
+	__u32 snd_ssthresh;
+	__u32 rcv_nxt;
+	__u32 snd_nxt;
+	__u32 snd_una;
+	__u32 mss_cache;
+	__u32 ecn_flags;
+	__u32 rate_delivered;
+	__u32 rate_interval_us;
+	__u32 packets_out;
+	__u32 retrans_out;
+	__u32 total_retrans;
+	__u32 segs_in;
+	__u32 data_segs_in;
+	__u32 segs_out;
+	__u32 data_segs_out;
+	__u32 lost_out;
+	__u32 sacked_out;
+	__u32 sk_txhash;
+	__u64 bytes_received;
+	__u64 bytes_acked;
 };
 
+/* Definitions for bpf_sock_ops_cb_flags */
+#define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
+#define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
+#define BPF_SOCK_OPS_STATE_CB_FLAG	(1<<2)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x7		/* Mask of all currently
+							 * supported cb flags
+							 */
+
 /* List of known BPF sock_ops operators.
  * New entries can only be added at the end
  */
@@ -1003,6 +1044,43 @@ enum {
 					 * a congestion threshold. RTTs above
 					 * this indicate congestion
 					 */
+	BPF_SOCK_OPS_RTO_CB,		/* Called when an RTO has triggered.
+					 * Arg1: value of icsk_retransmits
+					 * Arg2: value of icsk_rto
+					 * Arg3: whether RTO has expired
+					 */
+	BPF_SOCK_OPS_RETRANS_CB,	/* Called when skb is retransmitted.
+					 * Arg1: sequence number of 1st byte
+					 * Arg2: # segments
+					 * Arg3: return value of
+					 *       tcp_transmit_skb (0 => success)
+					 */
+	BPF_SOCK_OPS_STATE_CB,		/* Called when TCP changes state.
+					 * Arg1: old_state
+					 * Arg2: new_state
+					 */
+};
+
+/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
+ * changes between the TCP and BPF versions. Ideally this should never happen.
+ * If it does, we need to add code to convert them before calling
+ * the BPF sock_ops function.
+ */
+enum {
+	BPF_TCP_ESTABLISHED = 1,
+	BPF_TCP_SYN_SENT,
+	BPF_TCP_SYN_RECV,
+	BPF_TCP_FIN_WAIT1,
+	BPF_TCP_FIN_WAIT2,
+	BPF_TCP_TIME_WAIT,
+	BPF_TCP_CLOSE,
+	BPF_TCP_CLOSE_WAIT,
+	BPF_TCP_LAST_ACK,
+	BPF_TCP_LISTEN,
+	BPF_TCP_CLOSING,	/* Now a valid state */
+	BPF_TCP_NEW_SYN_RECV,
+
+	BPF_TCP_MAX_STATES	/* Leave at the end! */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 3a44b655d852..98688352208b 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -14,13 +14,13 @@ CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../i
 LDLIBS += -lcap -lelf -lrt
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
-	test_align test_verifier_log test_dev_cgroup
+	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
-	sample_map_ret0.o
+	sample_map_ret0.o test_tcpbpf_kern.o
 
 TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
 	test_offload.py
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 33cb00e46c49..dde2c11d7771 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -71,6 +71,8 @@ static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval,
 static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval,
 			     int optlen) =
 	(void *) BPF_FUNC_getsockopt;
+static int (*bpf_sock_ops_cb_flags_set)(void *ctx, int flags) =
+	(void *) BPF_FUNC_sock_ops_cb_flags_set;
 static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) =
 	(void *) BPF_FUNC_sk_redirect_map;
 static int (*bpf_sock_map_update)(void *map, void *key, void *value,
diff --git a/tools/testing/selftests/bpf/tcp_client.py b/tools/testing/selftests/bpf/tcp_client.py
new file mode 100755
index 000000000000..481dccdf140c
--- /dev/null
+++ b/tools/testing/selftests/bpf/tcp_client.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python2
+#
+# SPDX-License-Identifier: GPL-2.0
+#
+
+import sys, os, os.path, getopt
+import socket, time
+import subprocess
+import select
+
+def read(sock, n):
+    buf = ''
+    while len(buf) < n:
+        rem = n - len(buf)
+        try: s = sock.recv(rem)
+        except (socket.error), e: return ''
+        buf += s
+    return buf
+
+def send(sock, s):
+    total = len(s)
+    count = 0
+    while count < total:
+        try: n = sock.send(s)
+        except (socket.error), e: n = 0
+        if n == 0:
+            return count;
+        count += n
+    return count
+
+
+serverPort = int(sys.argv[1])
+HostName = socket.gethostname()
+
+# create active socket
+sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
+try:
+    sock.connect((HostName, serverPort))
+except socket.error as e:
+    sys.exit(1)
+
+buf = ''
+n = 0
+while n < 1000:
+    buf += '+'
+    n += 1
+
+sock.settimeout(1);
+n = send(sock, buf)
+n = read(sock, 500)
+sys.exit(0)
diff --git a/tools/testing/selftests/bpf/tcp_server.py b/tools/testing/selftests/bpf/tcp_server.py
new file mode 100755
index 000000000000..bc454d7d0be2
--- /dev/null
+++ b/tools/testing/selftests/bpf/tcp_server.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python2
+#
+# SPDX-License-Identifier: GPL-2.0
+#
+
+import sys, os, os.path, getopt
+import socket, time
+import subprocess
+import select
+
+def read(sock, n):
+    buf = ''
+    while len(buf) < n:
+        rem = n - len(buf)
+        try: s = sock.recv(rem)
+        except (socket.error), e: return ''
+        buf += s
+    return buf
+
+def send(sock, s):
+    total = len(s)
+    count = 0
+    while count < total:
+        try: n = sock.send(s)
+        except (socket.error), e: n = 0
+        if n == 0:
+            return count;
+        count += n
+    return count
+
+
+SERVER_PORT = 12877
+MAX_PORTS = 2
+
+serverPort = SERVER_PORT
+serverSocket = None
+
+HostName = socket.gethostname()
+
+# create passive socket
+serverSocket = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
+host = socket.gethostname()
+
+try: serverSocket.bind((host, 0))
+except socket.error as msg:
+    print 'bind fails: ', msg
+
+sn = serverSocket.getsockname()
+serverPort = sn[1]
+
+cmdStr = ("./tcp_client.py %d &") % (serverPort)
+os.system(cmdStr)
+
+buf = ''
+n = 0
+while n < 500:
+    buf += '.'
+    n += 1
+
+serverSocket.listen(MAX_PORTS)
+readList = [serverSocket]
+
+while True:
+    readyRead, readyWrite, inError = \
+        select.select(readList, [], [], 2)
+
+    if len(readyRead) > 0:
+        waitCount = 0
+        for sock in readyRead:
+            if sock == serverSocket:
+                (clientSocket, address) = serverSocket.accept()
+                address = str(address[0])
+                readList.append(clientSocket)
+            else:
+                sock.settimeout(1);
+                s = read(sock, 1000)
+                n = send(sock, buf)
+                sock.close()
+                serverSocket.close()
+                sys.exit(0)
+    else:
+        print 'Select timeout!'
+        sys.exit(1)
diff --git a/tools/testing/selftests/bpf/test_tcpbpf.h b/tools/testing/selftests/bpf/test_tcpbpf.h
new file mode 100644
index 000000000000..2fe43289943c
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf.h
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _TEST_TCPBPF_H
+#define _TEST_TCPBPF_H
+
+struct tcpbpf_globals {
+	__u32 event_map;
+	__u32 total_retrans;
+	__u32 data_segs_in;
+	__u32 data_segs_out;
+	__u32 bad_cb_test_rv;
+	__u32 good_cb_test_rv;
+	__u64 bytes_received;
+	__u64 bytes_acked;
+};
+#endif
diff --git a/tools/testing/selftests/bpf/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/test_tcpbpf_kern.c
new file mode 100644
index 000000000000..66bf71541903
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf_kern.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/in6.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <netinet/in.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+#include "test_tcpbpf.h"
+
+struct bpf_map_def SEC("maps") global_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct tcpbpf_globals),
+	.max_entries = 2,
+};
+
+static inline void update_event_map(int event)
+{
+	__u32 key = 0;
+	struct tcpbpf_globals g, *gp;
+
+	gp = bpf_map_lookup_elem(&global_map, &key);
+	if (gp == NULL) {
+		struct tcpbpf_globals g = {0};
+
+		g.event_map |= (1 << event);
+		bpf_map_update_elem(&global_map, &key, &g,
+			    BPF_ANY);
+	} else {
+		g = *gp;
+		g.event_map |= (1 << event);
+		bpf_map_update_elem(&global_map, &key, &g,
+			    BPF_ANY);
+	}
+}
+
+int _version SEC("version") = 1;
+
+SEC("sockops")
+int bpf_testcb(struct bpf_sock_ops *skops)
+{
+	int rv = -1;
+	int bad_call_rv = 0;
+	int good_call_rv = 0;
+	int op;
+	int v = 0;
+
+	op = (int) skops->op;
+
+	update_event_map(op);
+
+	switch (op) {
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+		/* Test failure to set largest cb flag (assumes not defined) */
+		bad_call_rv = bpf_sock_ops_cb_flags_set(skops, 0x80);
+		/* Set callback */
+		good_call_rv = bpf_sock_ops_cb_flags_set(skops,
+						 BPF_SOCK_OPS_STATE_CB_FLAG);
+		/* Update results */
+		{
+			__u32 key = 0;
+			struct tcpbpf_globals g, *gp;
+
+			gp = bpf_map_lookup_elem(&global_map, &key);
+			if (!gp)
+				break;
+			g = *gp;
+			g.bad_cb_test_rv = bad_call_rv;
+			g.good_cb_test_rv = good_call_rv;
+			bpf_map_update_elem(&global_map, &key, &g,
+					    BPF_ANY);
+		}
+		break;
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+		/* Set callback */
+//		good_call_rv = bpf_sock_ops_cb_flags_set(skops,
+//						 BPF_SOCK_OPS_STATE_CB_FLAG);
+		skops->sk_txhash = 0x12345f;
+		v = 0xff;
+		rv = bpf_setsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v,
+				    sizeof(v));
+		break;
+	case BPF_SOCK_OPS_RTO_CB:
+		break;
+	case BPF_SOCK_OPS_RETRANS_CB:
+		break;
+	case BPF_SOCK_OPS_STATE_CB:
+		if (skops->args[1] == BPF_TCP_CLOSE) {
+			__u32 key = 0;
+			struct tcpbpf_globals g, *gp;
+
+			gp = bpf_map_lookup_elem(&global_map, &key);
+			if (!gp)
+				break;
+			g = *gp;
+			g.total_retrans = skops->total_retrans;
+			g.data_segs_in = skops->data_segs_in;
+			g.data_segs_out = skops->data_segs_out;
+			g.bytes_received = skops->bytes_received;
+			g.bytes_acked = skops->bytes_acked;
+			bpf_map_update_elem(&global_map, &key, &g,
+					    BPF_ANY);
+		}
+		break;
+	default:
+		rv = -1;
+	}
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c
new file mode 100644
index 000000000000..95a370f3d378
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <signal.h>
+#include <string.h>
+#include <assert.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "bpf_util.h"
+#include <linux/perf_event.h>
+#include "test_tcpbpf.h"
+
+/* Fd of map @name in @obj, or -1 after printing a @test-tagged FAIL line. */
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+			const char *name)
+{
+	struct bpf_map *found = bpf_object__find_map_by_name(obj, name);
+
+	if (found)
+		return bpf_map__fd(found);
+
+	printf("%s:FAIL:map '%s' not found\n", test, name);
+	return -1;
+}
+
+/* Run CMD via system(); a non-zero status is reported but not fatal. */
+#define SYSTEM(CMD)						\
+	do {							\
+		if (system(CMD) != 0)				\
+			printf("system(%s) FAILS!\n", CMD);	\
+	} while (0)
+
+/*
+ * Join cgroup /tmp/cgroupv2/foo, attach the sock_ops program from
+ * test_tcpbpf_kern.o to it, run ./tcp_server.py and check "global_map"
+ * ("-d" dumps it on mismatch).  Returns 0 if it matches, else EXIT_FAILURE.
+ */
+int main(int argc, char **argv)
+{
+	const char *file = "test_tcpbpf_kern.o";
+	const char *dir = "/tmp/cgroupv2/foo";
+	bool debug_flag = argc > 1 && strcmp(argv[1], "-d") == 0;
+	struct tcpbpf_globals g = {0};
+	int cg_fd, prog_fd, map_fd;
+	int error = EXIT_FAILURE;
+	struct bpf_object *obj;
+	struct stat buffer;
+	char cmd[100];
+	__u32 key = 0;
+	int rv;
+
+	if (stat(dir, &buffer) != 0) {
+		SYSTEM("mkdir -p /tmp/cgroupv2");
+		SYSTEM("mount -t cgroup2 none /tmp/cgroupv2");
+		SYSTEM("mkdir -p /tmp/cgroupv2/foo");
+	}
+	snprintf(cmd, sizeof(cmd),
+		 "echo %d >> /tmp/cgroupv2/foo/cgroup.procs", (int) getpid());
+	SYSTEM(cmd);
+
+	cg_fd = open(dir, O_DIRECTORY | O_RDONLY);
+	if (cg_fd < 0) {
+		printf("FAILED: open(%s): %s\n", dir, strerror(errno));
+		return error;
+	}
+	if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) {
+		printf("FAILED: load_bpf_file failed for: %s\n", file);
+		goto err;
+	}
+
+	rv = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SOCK_OPS, 0);
+	if (rv) {
+		printf("FAILED: bpf_prog_attach: %d (%s)\n",
+		       rv, strerror(errno));
+		goto err;
+	}
+
+	SYSTEM("./tcp_server.py");
+
+	map_fd = bpf_find_map(__func__, obj, "global_map");
+	if (map_fd < 0)
+		goto err;
+
+	rv = bpf_map_lookup_elem(map_fd, &key, &g);
+	if (rv != 0) {
+		printf("FAILED: bpf_map_lookup_elem returns %d\n", rv);
+		goto err;
+	}
+
+	if (g.bytes_received != 501 || g.bytes_acked != 1002 ||
+	    g.data_segs_in != 1 || g.data_segs_out != 1 ||
+	    (g.event_map ^ 0x47e) != 0 || g.bad_cb_test_rv != 0x80 ||
+	    g.good_cb_test_rv != 0) {
+		printf("FAILED: Wrong stats\n");
+		if (debug_flag) {
+			printf("\n"
+			       "bytes_received: %d (expecting 501)\n"
+			       "bytes_acked:    %d (expecting 1002)\n"
+			       "data_segs_in:   %d (expecting 1)\n"
+			       "data_segs_out:  %d (expecting 1)\n"
+			       "event_map:      0x%x (at least 0x47e)\n"
+			       "bad_cb_test_rv: 0x%x (expecting 0x80)\n"
+			       "good_cb_test_rv:0x%x (expecting 0)\n",
+			       (int)g.bytes_received, (int)g.bytes_acked,
+			       g.data_segs_in, g.data_segs_out, g.event_map,
+			       g.bad_cb_test_rv, g.good_cb_test_rv);
+		}
+		goto err;
+	}
+	printf("PASSED!\n");
+	error = 0;
+err:
+	bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS);
+	close(cg_fd);
+	return error;
+}
-- 
cgit v1.2.3