summaryrefslogtreecommitdiffstats
path: root/tools/lib
diff options
context:
space:
mode:
authorDaniel Borkmann <daniel@iogearbox.net>2020-05-29 17:11:09 +0200
committerAlexei Starovoitov <ast@kernel.org>2020-06-01 14:38:23 -0700
commite255d3273920dfd2d4e1cf2afe565b942c122219 (patch)
tree2af8a49b786c2f9e879b3433c7962464cfa77518 /tools/lib
parent43dd115b1fffdd8d2c4cc15659c00b2a1addbc43 (diff)
parent97abb2b396821f21c21cee2d537bb4e0a0eef31b (diff)
downloadlinux-e255d3273920dfd2d4e1cf2afe565b942c122219.tar.gz
linux-e255d3273920dfd2d4e1cf2afe565b942c122219.zip
Merge branch 'bpf-ring-buffer'
Andrii Nakryiko says: ==================== Implement a new BPF ring buffer, as presented at BPF virtual conference ([0]). It presents an alternative to perf buffer, following its semantics closely, but allowing sharing same instance of ring buffer across multiple CPUs efficiently. Most patches have extensive commentary explaining various aspects, so I'll keep cover letter short. Overall structure of the patch set: - patch #1 adds BPF ring buffer implementation to kernel and necessary verifier support; - patch #2 adds libbpf consumer implementation for BPF ringbuf; - patch #3 adds selftest, both for single BPF ring buf use case, as well as using it with array/hash of maps; - patch #4 adds extensive benchmarks and provide some analysis in commit message, it builds upon selftests/bpf's bench runner. - patch #5 adds most of patch #1 commit message as a doc under Documentation/bpf/ringbuf.rst. Litmus tests, validating consumer/producer protocols and memory orderings, were moved out as discussed in [1] and are going to be posted against -rcu tree and put under Documentation/litmus-tests/bpf-rb. [0] https://docs.google.com/presentation/d/18ITdg77Bj6YDOH2LghxrnFxiPWe0fAqcmJY95t_qr0w [1] https://lkml.org/lkml/2020/5/22/1011 v3->v4: - fix ringbuf freeing (vunmap, __free_page); verified with a trivial loop creating and closing ringbuf map endlessly (Daniel); v2->v3: - dropped unnecessary smp_wmb() (Paul); - verifier reference type enhancement patch was dropped (Alexei); - better verifier message for various memory access checks (Alexei); - clarified a bit roundup_len() bit shifting (Alexei); - converted doc to .rst (Alexei); - fixed warning on 32-bit arches regarding tautological ring area size check. v1->v2: - commit()/discard()/output() accept flags (NO_WAKEUP/FORCE_WAKEUP) (Stanislav); - bpf_ringbuf_query() added, returning available data size, ringbuf size, consumer/producer positions, needed to implement smarter notification policy (Stanislav); - added ringbuf UAPI constants to include/uapi/linux/bpf.h (Jonathan); - fixed sample size check, added proper ringbuf size check (Jonathan, Alexei); - wake_up_all() is done through irq_work (Alexei); - consistent use of smp_load_acquire/smp_store_release, no READ_ONCE/WRITE_ONCE (Alexei); - added Documentation/bpf/ringbuf.txt (Stanislav); - updated litmus test with smp_load_acquire/smp_store_release changes; - added ring_buffer__consume() API to libbpf for busy-polling; - ring_buffer__poll() on success returns number of records consumed; - fixed EPOLL notifications, don't assume available data, done similarly to perfbuf's implementation; - both ringbuf and perfbuf now have --rb-sampled mode, instead of pb-raw/pb-custom mode, updated benchmark results; - extended ringbuf selftests to validate epoll logic/manual notification logic, as well as bpf_ringbuf_query(). ==================== Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Diffstat (limited to 'tools/lib')
-rw-r--r--tools/lib/bpf/Build2
-rw-r--r--tools/lib/bpf/libbpf.h21
-rw-r--r--tools/lib/bpf/libbpf.map5
-rw-r--r--tools/lib/bpf/libbpf_probes.c5
-rw-r--r--tools/lib/bpf/ringbuf.c285
5 files changed, 317 insertions, 1 deletions
diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index e3962cfbc9a6..190366d05588 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1,3 +1,3 @@
libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \
netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \
- btf_dump.o
+ btf_dump.o ringbuf.o
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 1e2e399a5f2c..8528a02d5af8 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -478,6 +478,27 @@ LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags);
LIBBPF_API int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info,
size_t info_size, __u32 flags);
+/* Ring buffer APIs */
+struct ring_buffer;
+
+typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size);
+
+struct ring_buffer_opts {
+ size_t sz; /* size of this struct, for forward/backward compatiblity */
+};
+
+#define ring_buffer_opts__last_field sz
+
+LIBBPF_API struct ring_buffer *
+ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx,
+ const struct ring_buffer_opts *opts);
+LIBBPF_API void ring_buffer__free(struct ring_buffer *rb);
+LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd,
+ ring_buffer_sample_fn sample_cb, void *ctx);
+LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms);
+LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb);
+
+/* Perf buffer APIs */
struct perf_buffer;
typedef void (*perf_buffer_sample_fn)(void *ctx, int cpu,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 381a7342ecfc..c18860200abb 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -263,4 +263,9 @@ LIBBPF_0.0.9 {
bpf_link_get_next_id;
bpf_program__attach_iter;
perf_buffer__consume;
+ ring_buffer__add;
+ ring_buffer__consume;
+ ring_buffer__free;
+ ring_buffer__new;
+ ring_buffer__poll;
} LIBBPF_0.0.8;
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 2c92059c0c90..10cd8d1891f5 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -238,6 +238,11 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
if (btf_fd < 0)
return false;
break;
+ case BPF_MAP_TYPE_RINGBUF:
+ key_size = 0;
+ value_size = 0;
+ max_entries = 4096;
+ break;
case BPF_MAP_TYPE_UNSPEC:
case BPF_MAP_TYPE_HASH:
case BPF_MAP_TYPE_ARRAY:
diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c
new file mode 100644
index 000000000000..bc10fa1d43c7
--- /dev/null
+++ b/tools/lib/bpf/ringbuf.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+/*
+ * Ring buffer operations.
+ *
+ * Copyright (C) 2020 Facebook, Inc.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <linux/err.h>
+#include <linux/bpf.h>
+#include <asm/barrier.h>
+#include <sys/mman.h>
+#include <sys/epoll.h>
+#include <tools/libc_compat.h>
+
+#include "libbpf.h"
+#include "libbpf_internal.h"
+#include "bpf.h"
+
+/* make sure libbpf doesn't use kernel-only integer typedefs */
+#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
+
+struct ring {
+ ring_buffer_sample_fn sample_cb;
+ void *ctx;
+ void *data;
+ unsigned long *consumer_pos;
+ unsigned long *producer_pos;
+ unsigned long mask;
+ int map_fd;
+};
+
+struct ring_buffer {
+ struct epoll_event *events;
+ struct ring *rings;
+ size_t page_size;
+ int epoll_fd;
+ int ring_cnt;
+};
+
+static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r)
+{
+ if (r->consumer_pos) {
+ munmap(r->consumer_pos, rb->page_size);
+ r->consumer_pos = NULL;
+ }
+ if (r->producer_pos) {
+ munmap(r->producer_pos, rb->page_size + 2 * (r->mask + 1));
+ r->producer_pos = NULL;
+ }
+}
+
+/* Add extra RINGBUF maps to this ring buffer manager */
+int ring_buffer__add(struct ring_buffer *rb, int map_fd,
+ ring_buffer_sample_fn sample_cb, void *ctx)
+{
+ struct bpf_map_info info;
+ __u32 len = sizeof(info);
+ struct epoll_event *e;
+ struct ring *r;
+ void *tmp;
+ int err;
+
+ memset(&info, 0, sizeof(info));
+
+ err = bpf_obj_get_info_by_fd(map_fd, &info, &len);
+ if (err) {
+ err = -errno;
+ pr_warn("ringbuf: failed to get map info for fd=%d: %d\n",
+ map_fd, err);
+ return err;
+ }
+
+ if (info.type != BPF_MAP_TYPE_RINGBUF) {
+ pr_warn("ringbuf: map fd=%d is not BPF_MAP_TYPE_RINGBUF\n",
+ map_fd);
+ return -EINVAL;
+ }
+
+ tmp = reallocarray(rb->rings, rb->ring_cnt + 1, sizeof(*rb->rings));
+ if (!tmp)
+ return -ENOMEM;
+ rb->rings = tmp;
+
+ tmp = reallocarray(rb->events, rb->ring_cnt + 1, sizeof(*rb->events));
+ if (!tmp)
+ return -ENOMEM;
+ rb->events = tmp;
+
+ r = &rb->rings[rb->ring_cnt];
+ memset(r, 0, sizeof(*r));
+
+ r->map_fd = map_fd;
+ r->sample_cb = sample_cb;
+ r->ctx = ctx;
+ r->mask = info.max_entries - 1;
+
+ /* Map writable consumer page */
+ tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ map_fd, 0);
+ if (tmp == MAP_FAILED) {
+ err = -errno;
+ pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n",
+ map_fd, err);
+ return err;
+ }
+ r->consumer_pos = tmp;
+
+ /* Map read-only producer page and data pages. We map twice as big
+ * data size to allow simple reading of samples that wrap around the
+ * end of a ring buffer. See kernel implementation for details.
+ * */
+ tmp = mmap(NULL, rb->page_size + 2 * info.max_entries, PROT_READ,
+ MAP_SHARED, map_fd, rb->page_size);
+ if (tmp == MAP_FAILED) {
+ err = -errno;
+ ringbuf_unmap_ring(rb, r);
+ pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n",
+ map_fd, err);
+ return err;
+ }
+ r->producer_pos = tmp;
+ r->data = tmp + rb->page_size;
+
+ e = &rb->events[rb->ring_cnt];
+ memset(e, 0, sizeof(*e));
+
+ e->events = EPOLLIN;
+ e->data.fd = rb->ring_cnt;
+ if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, e) < 0) {
+ err = -errno;
+ ringbuf_unmap_ring(rb, r);
+ pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n",
+ map_fd, err);
+ return err;
+ }
+
+ rb->ring_cnt++;
+ return 0;
+}
+
+void ring_buffer__free(struct ring_buffer *rb)
+{
+ int i;
+
+ if (!rb)
+ return;
+
+ for (i = 0; i < rb->ring_cnt; ++i)
+ ringbuf_unmap_ring(rb, &rb->rings[i]);
+ if (rb->epoll_fd >= 0)
+ close(rb->epoll_fd);
+
+ free(rb->events);
+ free(rb->rings);
+ free(rb);
+}
+
+struct ring_buffer *
+ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx,
+ const struct ring_buffer_opts *opts)
+{
+ struct ring_buffer *rb;
+ int err;
+
+ if (!OPTS_VALID(opts, ring_buffer_opts))
+ return NULL;
+
+ rb = calloc(1, sizeof(*rb));
+ if (!rb)
+ return NULL;
+
+ rb->page_size = getpagesize();
+
+ rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+ if (rb->epoll_fd < 0) {
+ err = -errno;
+ pr_warn("ringbuf: failed to create epoll instance: %d\n", err);
+ goto err_out;
+ }
+
+ err = ring_buffer__add(rb, map_fd, sample_cb, ctx);
+ if (err)
+ goto err_out;
+
+ return rb;
+
+err_out:
+ ring_buffer__free(rb);
+ return NULL;
+}
+
+static inline int roundup_len(__u32 len)
+{
+ /* clear out top 2 bits (discard and busy, if set) */
+ len <<= 2;
+ len >>= 2;
+ /* add length prefix */
+ len += BPF_RINGBUF_HDR_SZ;
+ /* round up to 8 byte alignment */
+ return (len + 7) / 8 * 8;
+}
+
+static int ringbuf_process_ring(struct ring* r)
+{
+ int *len_ptr, len, err, cnt = 0;
+ unsigned long cons_pos, prod_pos;
+ bool got_new_data;
+ void *sample;
+
+ cons_pos = smp_load_acquire(r->consumer_pos);
+ do {
+ got_new_data = false;
+ prod_pos = smp_load_acquire(r->producer_pos);
+ while (cons_pos < prod_pos) {
+ len_ptr = r->data + (cons_pos & r->mask);
+ len = smp_load_acquire(len_ptr);
+
+ /* sample not committed yet, bail out for now */
+ if (len & BPF_RINGBUF_BUSY_BIT)
+ goto done;
+
+ got_new_data = true;
+ cons_pos += roundup_len(len);
+
+ if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) {
+ sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ;
+ err = r->sample_cb(r->ctx, sample, len);
+ if (err) {
+ /* update consumer pos and bail out */
+ smp_store_release(r->consumer_pos,
+ cons_pos);
+ return err;
+ }
+ cnt++;
+ }
+
+ smp_store_release(r->consumer_pos, cons_pos);
+ }
+ } while (got_new_data);
+done:
+ return cnt;
+}
+
+/* Consume available ring buffer(s) data without event polling.
+ * Returns number of records consumed across all registered ring buffers, or
+ * negative number if any of the callbacks return error.
+ */
+int ring_buffer__consume(struct ring_buffer *rb)
+{
+ int i, err, res = 0;
+
+ for (i = 0; i < rb->ring_cnt; i++) {
+ struct ring *ring = &rb->rings[i];
+
+ err = ringbuf_process_ring(ring);
+ if (err < 0)
+ return err;
+ res += err;
+ }
+ return res;
+}
+
+/* Poll for available data and consume records, if any are available.
+ * Returns number of records consumed, or negative number, if any of the
+ * registered callbacks returned error.
+ */
+int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms)
+{
+ int i, cnt, err, res = 0;
+
+ cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms);
+ for (i = 0; i < cnt; i++) {
+ __u32 ring_id = rb->events[i].data.fd;
+ struct ring *ring = &rb->rings[ring_id];
+
+ err = ringbuf_process_ring(ring);
+ if (err < 0)
+ return err;
+ res += cnt;
+ }
+ return cnt < 0 ? -errno : res;
+}