about | summary | refs | log | tree | commit | diff | stats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r-- net/6lowpan/nhc_udp.c 4
-rw-r--r-- net/8021q/vlan_core.c 10
-rw-r--r-- net/8021q/vlan_dev.c 21
-rw-r--r-- net/Kconfig 14
-rw-r--r-- net/ax25/af_ax25.c 1
-rw-r--r-- net/batman-adv/bridge_loop_avoidance.c 5
-rw-r--r-- net/bpf/test_run.c 244
-rw-r--r-- net/bridge/br_arp_nd_proxy.c 4
-rw-r--r-- net/bridge/br_device.c 49
-rw-r--r-- net/bridge/br_fdb.c 50
-rw-r--r-- net/bridge/br_input.c 1
-rw-r--r-- net/bridge/br_mdb.c 148
-rw-r--r-- net/bridge/br_mrp.c 7
-rw-r--r-- net/bridge/br_multicast_eht.c 141
-rw-r--r-- net/bridge/br_private.h 20
-rw-r--r-- net/bridge/br_stp.c 27
-rw-r--r-- net/bridge/br_vlan.c 128
-rw-r--r-- net/bridge/br_vlan_tunnel.c 2
-rw-r--r-- net/core/Makefile 6
-rw-r--r-- net/core/bpf_sk_storage.c 2
-rw-r--r-- net/core/dev.c 364
-rw-r--r-- net/core/drop_monitor.c 2
-rw-r--r-- net/core/filter.c 269
-rw-r--r-- net/core/flow_dissector.c 41
-rw-r--r-- net/core/net-procfs.c 3
-rw-r--r-- net/core/net-sysfs.c 177
-rw-r--r-- net/core/skbuff.c 15
-rw-r--r-- net/core/skmsg.c 212
-rw-r--r-- net/core/sock_map.c 77
-rw-r--r-- net/core/sysctl_net_core.c 10
-rw-r--r-- net/decnet/dn_route.c 49
-rw-r--r-- net/dsa/Kconfig 17
-rw-r--r-- net/dsa/dsa_priv.h 23
-rw-r--r-- net/dsa/port.c 197
-rw-r--r-- net/dsa/slave.c 47
-rw-r--r-- net/dsa/tag_brcm.c 107
-rw-r--r-- net/dsa/tag_mtk.c 14
-rw-r--r-- net/dsa/tag_ocelot.c 8
-rw-r--r-- net/ethernet/eth.c 13
-rw-r--r-- net/ethtool/ioctl.c 12
-rw-r--r-- net/hsr/hsr_debugfs.c 2
-rw-r--r-- net/ipv4/Makefile 2
-rw-r--r-- net/ipv4/nexthop.c 1492
-rw-r--r-- net/ipv4/route.c 183
-rw-r--r-- net/ipv4/tcp_bpf.c 4
-rw-r--r-- net/ipv4/tcp_input.c 10
-rw-r--r-- net/ipv4/tcp_output.c 20
-rw-r--r-- net/ipv6/exthdrs.c 5
-rw-r--r-- net/ipv6/route.c 26
-rw-r--r-- net/ipv6/seg6_local.c 11
-rw-r--r-- net/l2tp/l2tp_core.c 2
-rw-r--r-- net/lapb/lapb_iface.c 4
-rw-r--r-- net/lapb/lapb_timer.c 19
-rw-r--r-- net/mptcp/options.c 47
-rw-r--r-- net/mptcp/pm.c 39
-rw-r--r-- net/mptcp/pm_netlink.c 139
-rw-r--r-- net/mptcp/protocol.h 27
-rw-r--r-- net/netfilter/nf_conntrack_proto_dccp.c 1
-rw-r--r-- net/netfilter/nf_flow_table_core.c 106
-rw-r--r-- net/netfilter/nf_flow_table_ip.c 461
-rw-r--r-- net/netfilter/nf_flow_table_offload.c 223
-rw-r--r-- net/netfilter/nf_tables_api.c 32
-rw-r--r-- net/netfilter/nft_ct.c 1
-rw-r--r-- net/netfilter/nft_flow_offload.c 211
-rw-r--r-- net/openvswitch/vport.c 8
-rw-r--r-- net/openvswitch/vport.h 2
-rw-r--r-- net/packet/af_packet.c 2
-rw-r--r-- net/psample/psample.c 45
-rw-r--r-- net/rds/recv.c 4
-rw-r--r-- net/rose/rose_route.c 2
-rw-r--r-- net/sched/act_police.c 59
-rw-r--r-- net/sched/act_sample.c 27
-rw-r--r-- net/sched/cls_api.c 3
-rw-r--r-- net/sched/cls_flower.c 40
-rw-r--r-- net/sched/sch_cbq.c 4
-rw-r--r-- net/sched/sch_generic.c 75
-rw-r--r-- net/sched/sch_taprio.c 64
-rw-r--r-- net/tipc/addr.c 1
-rw-r--r-- net/tipc/addr.h 46
-rw-r--r-- net/tipc/crypto.c 12
-rw-r--r-- net/tipc/monitor.c 63
-rw-r--r-- net/tipc/msg.c 23
-rw-r--r-- net/tipc/name_distr.c 93
-rw-r--r-- net/tipc/name_table.c 426
-rw-r--r-- net/tipc/name_table.h 63
-rw-r--r-- net/tipc/net.c 8
-rw-r--r-- net/tipc/netlink_compat.c 2
-rw-r--r-- net/tipc/node.c 33
-rw-r--r-- net/tipc/socket.c 319
-rw-r--r-- net/tipc/subscr.c 86
-rw-r--r-- net/tipc/subscr.h 14
-rw-r--r-- net/tls/tls_device.c 2
-rw-r--r-- net/xdp/xsk.c 114
-rw-r--r-- net/xdp/xsk_queue.h 30
-rw-r--r-- net/xdp/xskmap.c 17
95 files changed, 5148 insertions, 2147 deletions
diff --git a/net/6lowpan/nhc_udp.c b/net/6lowpan/nhc_udp.c
index 8a3507524f7b..33f17bd8cda7 100644
--- a/net/6lowpan/nhc_udp.c
+++ b/net/6lowpan/nhc_udp.c
@@ -5,7 +5,7 @@
* Authors:
* Alexander Aring <aar@pengutronix.de>
*
- * Orignal written by:
+ * Original written by:
* Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
* Jon Smirl <jonsmirl@gmail.com>
*/
@@ -82,7 +82,7 @@ static int udp_uncompress(struct sk_buff *skb, size_t needed)
if (fail)
return -EINVAL;
- /* UDP length needs to be infered from the lower layers
+ /* UDP length needs to be inferred from the lower layers
* here, we obtain the hint from the remaining size of the
* frame
*/
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 78ec2e1b14d1..59bc13b5f14f 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -4,6 +4,7 @@
#include <linux/if_vlan.h>
#include <linux/netpoll.h>
#include <linux/export.h>
+#include <net/gro.h>
#include "vlan.h"
bool vlan_do_receive(struct sk_buff **skbp)
@@ -495,7 +496,10 @@ static struct sk_buff *vlan_gro_receive(struct list_head *head,
skb_gro_pull(skb, sizeof(*vhdr));
skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr));
- pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
+
+ pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive,
+ ipv6_gro_receive, inet_gro_receive,
+ head, skb);
out_unlock:
rcu_read_unlock();
@@ -515,7 +519,9 @@ static int vlan_gro_complete(struct sk_buff *skb, int nhoff)
rcu_read_lock();
ptype = gro_find_complete_by_type(type);
if (ptype)
- err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr));
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
+ ipv6_gro_complete, inet_gro_complete,
+ skb, nhoff + sizeof(*vhdr));
rcu_read_unlock();
return err;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index dc1a197792e6..4db3f0621959 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -776,6 +776,26 @@ static int vlan_dev_get_iflink(const struct net_device *dev)
return real_dev->ifindex;
}
+static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(ctx->dev);
+
+ path->type = DEV_PATH_VLAN;
+ path->encap.id = vlan->vlan_id;
+ path->encap.proto = vlan->vlan_proto;
+ path->dev = ctx->dev;
+ ctx->dev = vlan->real_dev;
+ if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan))
+ return -ENOSPC;
+
+ ctx->vlan[ctx->num_vlans].id = vlan->vlan_id;
+ ctx->vlan[ctx->num_vlans].proto = vlan->vlan_proto;
+ ctx->num_vlans++;
+
+ return 0;
+}
+
static const struct ethtool_ops vlan_ethtool_ops = {
.get_link_ksettings = vlan_ethtool_get_link_ksettings,
.get_drvinfo = vlan_ethtool_get_drvinfo,
@@ -814,6 +834,7 @@ static const struct net_device_ops vlan_netdev_ops = {
#endif
.ndo_fix_features = vlan_dev_fix_features,
.ndo_get_iflink = vlan_dev_get_iflink,
+ .ndo_fill_forward_path = vlan_dev_fill_forward_path,
};
static void vlan_dev_free(struct net_device *dev)
diff --git a/net/Kconfig b/net/Kconfig
index 8cea808ad9e8..9c456acc379e 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -245,6 +245,14 @@ source "net/l3mdev/Kconfig"
source "net/qrtr/Kconfig"
source "net/ncsi/Kconfig"
+config PCPU_DEV_REFCNT
+ bool "Use percpu variables to maintain network device refcount"
+ depends on SMP
+ default y
+ help
+ Network device refcounts use per-CPU variables if this option is set.
+ This can be forced to N to detect underflows (with a performance drop).
+
config RPS
bool
depends on SMP && SYSFS
@@ -317,13 +325,9 @@ config BPF_STREAM_PARSER
select STREAM_PARSER
select NET_SOCK_MSG
help
- Enabling this allows a stream parser to be used with
+ Enabling this allows a TCP stream parser to be used with
BPF_MAP_TYPE_SOCKMAP.
- BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets.
- It can be used to enforce socket policy, implement socket redirects,
- etc.
-
config NET_FLOW_LIMIT
bool
depends on RPS
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 269ee89d2c2b..2631efc6e359 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -850,6 +850,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol,
case AX25_P_ROSE:
if (ax25_protocol_is_registered(AX25_P_ROSE))
return -ESOCKTNOSUPPORT;
+ break;
#endif
default:
break;
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 360bdbf44748..bcd543ce835b 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -438,10 +438,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
skb->len + ETH_HLEN);
- if (in_interrupt())
- netif_rx(skb);
- else
- netif_rx_ni(skb);
+ netif_rx_any_context(skb);
out:
if (primary_if)
batadv_hardif_put(primary_if);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 58bcb8c849d5..0abdd67f44b1 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -10,20 +10,86 @@
#include <net/bpf_sk_storage.h>
#include <net/sock.h>
#include <net/tcp.h>
+#include <net/net_namespace.h>
#include <linux/error-injection.h>
#include <linux/smp.h>
+#include <linux/sock_diag.h>
#define CREATE_TRACE_POINTS
#include <trace/events/bpf_test_run.h>
+struct bpf_test_timer {
+ enum { NO_PREEMPT, NO_MIGRATE } mode;
+ u32 i;
+ u64 time_start, time_spent;
+};
+
+static void bpf_test_timer_enter(struct bpf_test_timer *t)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ if (t->mode == NO_PREEMPT)
+ preempt_disable();
+ else
+ migrate_disable();
+
+ t->time_start = ktime_get_ns();
+}
+
+static void bpf_test_timer_leave(struct bpf_test_timer *t)
+ __releases(rcu)
+{
+ t->time_start = 0;
+
+ if (t->mode == NO_PREEMPT)
+ preempt_enable();
+ else
+ migrate_enable();
+ rcu_read_unlock();
+}
+
+static bool bpf_test_timer_continue(struct bpf_test_timer *t, u32 repeat, int *err, u32 *duration)
+ __must_hold(rcu)
+{
+ t->i++;
+ if (t->i >= repeat) {
+ /* We're done. */
+ t->time_spent += ktime_get_ns() - t->time_start;
+ do_div(t->time_spent, t->i);
+ *duration = t->time_spent > U32_MAX ? U32_MAX : (u32)t->time_spent;
+ *err = 0;
+ goto reset;
+ }
+
+ if (signal_pending(current)) {
+ /* During iteration: we've been cancelled, abort. */
+ *err = -EINTR;
+ goto reset;
+ }
+
+ if (need_resched()) {
+ /* During iteration: we need to reschedule between runs. */
+ t->time_spent += ktime_get_ns() - t->time_start;
+ bpf_test_timer_leave(t);
+ cond_resched();
+ bpf_test_timer_enter(t);
+ }
+
+ /* Do another round. */
+ return true;
+
+reset:
+ t->i = 0;
+ return false;
+}
+
static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
u32 *retval, u32 *time, bool xdp)
{
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL };
+ struct bpf_test_timer t = { NO_MIGRATE };
enum bpf_cgroup_storage_type stype;
- u64 time_start, time_spent = 0;
- int ret = 0;
- u32 i;
+ int ret;
for_each_cgroup_storage_type(stype) {
storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
@@ -38,40 +104,16 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
if (!repeat)
repeat = 1;
- rcu_read_lock();
- migrate_disable();
- time_start = ktime_get_ns();
- for (i = 0; i < repeat; i++) {
+ bpf_test_timer_enter(&t);
+ do {
bpf_cgroup_storage_set(storage);
if (xdp)
*retval = bpf_prog_run_xdp(prog, ctx);
else
*retval = BPF_PROG_RUN(prog, ctx);
-
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- if (need_resched()) {
- time_spent += ktime_get_ns() - time_start;
- migrate_enable();
- rcu_read_unlock();
-
- cond_resched();
-
- rcu_read_lock();
- migrate_disable();
- time_start = ktime_get_ns();
- }
- }
- time_spent += ktime_get_ns() - time_start;
- migrate_enable();
- rcu_read_unlock();
-
- do_div(time_spent, repeat);
- *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
+ } while (bpf_test_timer_continue(&t, repeat, &ret, time));
+ bpf_test_timer_leave(&t);
for_each_cgroup_storage_type(stype)
bpf_cgroup_storage_free(storage[stype]);
@@ -674,18 +716,17 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
const union bpf_attr *kattr,
union bpf_attr __user *uattr)
{
+ struct bpf_test_timer t = { NO_PREEMPT };
u32 size = kattr->test.data_size_in;
struct bpf_flow_dissector ctx = {};
u32 repeat = kattr->test.repeat;
struct bpf_flow_keys *user_ctx;
struct bpf_flow_keys flow_keys;
- u64 time_start, time_spent = 0;
const struct ethhdr *eth;
unsigned int flags = 0;
u32 retval, duration;
void *data;
int ret;
- u32 i;
if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR)
return -EINVAL;
@@ -721,48 +762,127 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
ctx.data = data;
ctx.data_end = (__u8 *)data + size;
- rcu_read_lock();
- preempt_disable();
- time_start = ktime_get_ns();
- for (i = 0; i < repeat; i++) {
+ bpf_test_timer_enter(&t);
+ do {
retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
size, flags);
+ } while (bpf_test_timer_continue(&t, repeat, &ret, &duration));
+ bpf_test_timer_leave(&t);
- if (signal_pending(current)) {
- preempt_enable();
- rcu_read_unlock();
+ if (ret < 0)
+ goto out;
- ret = -EINTR;
- goto out;
- }
+ ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys),
+ retval, duration);
+ if (!ret)
+ ret = bpf_ctx_finish(kattr, uattr, user_ctx,
+ sizeof(struct bpf_flow_keys));
- if (need_resched()) {
- time_spent += ktime_get_ns() - time_start;
- preempt_enable();
- rcu_read_unlock();
+out:
+ kfree(user_ctx);
+ kfree(data);
+ return ret;
+}
- cond_resched();
+int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_test_timer t = { NO_PREEMPT };
+ struct bpf_prog_array *progs = NULL;
+ struct bpf_sk_lookup_kern ctx = {};
+ u32 repeat = kattr->test.repeat;
+ struct bpf_sk_lookup *user_ctx;
+ u32 retval, duration;
+ int ret = -EINVAL;
- rcu_read_lock();
- preempt_disable();
- time_start = ktime_get_ns();
- }
+ if (prog->type != BPF_PROG_TYPE_SK_LOOKUP)
+ return -EINVAL;
+
+ if (kattr->test.flags || kattr->test.cpu)
+ return -EINVAL;
+
+ if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out ||
+ kattr->test.data_size_out)
+ return -EINVAL;
+
+ if (!repeat)
+ repeat = 1;
+
+ user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx));
+ if (IS_ERR(user_ctx))
+ return PTR_ERR(user_ctx);
+
+ if (!user_ctx)
+ return -EINVAL;
+
+ if (user_ctx->sk)
+ goto out;
+
+ if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx)))
+ goto out;
+
+ if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) {
+ ret = -ERANGE;
+ goto out;
}
- time_spent += ktime_get_ns() - time_start;
- preempt_enable();
- rcu_read_unlock();
- do_div(time_spent, repeat);
- duration = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
+ ctx.family = (u16)user_ctx->family;
+ ctx.protocol = (u16)user_ctx->protocol;
+ ctx.dport = (u16)user_ctx->local_port;
+ ctx.sport = (__force __be16)user_ctx->remote_port;
- ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys),
- retval, duration);
+ switch (ctx.family) {
+ case AF_INET:
+ ctx.v4.daddr = (__force __be32)user_ctx->local_ip4;
+ ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4;
+ break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6;
+ ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6;
+ break;
+#endif
+
+ default:
+ ret = -EAFNOSUPPORT;
+ goto out;
+ }
+
+ progs = bpf_prog_array_alloc(1, GFP_KERNEL);
+ if (!progs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ progs->items[0].prog = prog;
+
+ bpf_test_timer_enter(&t);
+ do {
+ ctx.selected_sk = NULL;
+ retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN);
+ } while (bpf_test_timer_continue(&t, repeat, &ret, &duration));
+ bpf_test_timer_leave(&t);
+
+ if (ret < 0)
+ goto out;
+
+ user_ctx->cookie = 0;
+ if (ctx.selected_sk) {
+ if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ user_ctx->cookie = sock_gen_cookie(ctx.selected_sk);
+ }
+
+ ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration);
if (!ret)
- ret = bpf_ctx_finish(kattr, uattr, user_ctx,
- sizeof(struct bpf_flow_keys));
+ ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx));
out:
+ bpf_prog_array_free(progs);
kfree(user_ctx);
- kfree(data);
return ret;
}
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index dfec65eca8a6..3db1def4437b 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -160,7 +160,9 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
if (p && (p->flags & BR_NEIGH_SUPPRESS))
return;
- if (ipv4_is_zeronet(sip) || sip == tip) {
+ if (parp->ar_op != htons(ARPOP_RREQUEST) &&
+ parp->ar_op != htons(ARPOP_RREPLY) &&
+ (ipv4_is_zeronet(sip) || sip == tip)) {
/* prevent flooding to neigh suppress ports */
BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
return;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 3f2f06b4dd27..e8b626cc6bfd 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -385,6 +385,54 @@ static int br_del_slave(struct net_device *dev, struct net_device *slave_dev)
return br_del_if(br, slave_dev);
}
+static int br_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct net_bridge_fdb_entry *f;
+ struct net_bridge_port *dst;
+ struct net_bridge *br;
+
+ if (netif_is_bridge_port(ctx->dev))
+ return -1;
+
+ br = netdev_priv(ctx->dev);
+
+ br_vlan_fill_forward_path_pvid(br, ctx, path);
+
+ f = br_fdb_find_rcu(br, ctx->daddr, path->bridge.vlan_id);
+ if (!f || !f->dst)
+ return -1;
+
+ dst = READ_ONCE(f->dst);
+ if (!dst)
+ return -1;
+
+ if (br_vlan_fill_forward_path_mode(br, dst, path))
+ return -1;
+
+ path->type = DEV_PATH_BRIDGE;
+ path->dev = dst->br->dev;
+ ctx->dev = dst->dev;
+
+ switch (path->bridge.vlan_mode) {
+ case DEV_PATH_BR_VLAN_TAG:
+ if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan))
+ return -ENOSPC;
+ ctx->vlan[ctx->num_vlans].id = path->bridge.vlan_id;
+ ctx->vlan[ctx->num_vlans].proto = path->bridge.vlan_proto;
+ ctx->num_vlans++;
+ break;
+ case DEV_PATH_BR_VLAN_UNTAG_HW:
+ case DEV_PATH_BR_VLAN_UNTAG:
+ ctx->num_vlans--;
+ break;
+ case DEV_PATH_BR_VLAN_KEEP:
+ break;
+ }
+
+ return 0;
+}
+
static const struct ethtool_ops br_ethtool_ops = {
.get_drvinfo = br_getinfo,
.get_link = ethtool_op_get_link,
@@ -419,6 +467,7 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_bridge_setlink = br_setlink,
.ndo_bridge_dellink = br_dellink,
.ndo_features_check = passthru_features_check,
+ .ndo_fill_forward_path = br_fill_forward_path,
};
static struct device_type br_type = {
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index b7490237f3fc..698b79747d32 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -726,6 +726,56 @@ static inline size_t fdb_nlmsg_size(void)
+ nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */
}
+static int br_fdb_replay_one(struct notifier_block *nb,
+ struct net_bridge_fdb_entry *fdb,
+ struct net_device *dev)
+{
+ struct switchdev_notifier_fdb_info item;
+ int err;
+
+ item.addr = fdb->key.addr.addr;
+ item.vid = fdb->key.vlan_id;
+ item.added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
+ item.offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags);
+ item.info.dev = dev;
+
+ err = nb->notifier_call(nb, SWITCHDEV_FDB_ADD_TO_DEVICE, &item);
+ return notifier_to_errno(err);
+}
+
+int br_fdb_replay(struct net_device *br_dev, struct net_device *dev,
+ struct notifier_block *nb)
+{
+ struct net_bridge_fdb_entry *fdb;
+ struct net_bridge *br;
+ int err = 0;
+
+ if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev))
+ return -EINVAL;
+
+ br = netdev_priv(br_dev);
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) {
+ struct net_bridge_port *dst = READ_ONCE(fdb->dst);
+ struct net_device *dst_dev;
+
+ dst_dev = dst ? dst->dev : br->dev;
+ if (dst_dev != br_dev && dst_dev != dev)
+ continue;
+
+ err = br_fdb_replay_one(nb, fdb, dst_dev);
+ if (err)
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(br_fdb_replay);
+
static void fdb_notify(struct net_bridge *br,
const struct net_bridge_fdb_entry *fdb, int type,
bool swdev_notify)
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 222285d9dae2..8875e953ac53 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -144,6 +144,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
break;
case BR_PKT_UNICAST:
dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid);
+ break;
default:
break;
}
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 8846c5bcd075..95fa4af0e8dd 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -506,6 +506,134 @@ err:
kfree(priv);
}
+static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb,
+ const struct net_bridge_mdb_entry *mp)
+{
+ if (mp->addr.proto == htons(ETH_P_IP))
+ ip_eth_mc_map(mp->addr.dst.ip4, mdb->addr);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (mp->addr.proto == htons(ETH_P_IPV6))
+ ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb->addr);
+#endif
+ else
+ ether_addr_copy(mdb->addr, mp->addr.dst.mac_addr);
+
+ mdb->vid = mp->addr.vid;
+}
+
+static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev,
+ struct switchdev_obj_port_mdb *mdb,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_notifier_port_obj_info obj_info = {
+ .info = {
+ .dev = dev,
+ .extack = extack,
+ },
+ .obj = &mdb->obj,
+ };
+ int err;
+
+ err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info);
+ return notifier_to_errno(err);
+}
+
+static int br_mdb_queue_one(struct list_head *mdb_list,
+ enum switchdev_obj_id id,
+ const struct net_bridge_mdb_entry *mp,
+ struct net_device *orig_dev)
+{
+ struct switchdev_obj_port_mdb *mdb;
+
+ mdb = kzalloc(sizeof(*mdb), GFP_ATOMIC);
+ if (!mdb)
+ return -ENOMEM;
+
+ mdb->obj.id = id;
+ mdb->obj.orig_dev = orig_dev;
+ br_switchdev_mdb_populate(mdb, mp);
+ list_add_tail(&mdb->obj.list, mdb_list);
+
+ return 0;
+}
+
+int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
+ struct notifier_block *nb, struct netlink_ext_ack *extack)
+{
+ struct net_bridge_mdb_entry *mp;
+ struct switchdev_obj *obj, *tmp;
+ struct net_bridge *br;
+ LIST_HEAD(mdb_list);
+ int err = 0;
+
+ ASSERT_RTNL();
+
+ if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev))
+ return -EINVAL;
+
+ br = netdev_priv(br_dev);
+
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ return 0;
+
+ /* We cannot walk over br->mdb_list protected just by the rtnl_mutex,
+ * because the write-side protection is br->multicast_lock. But we
+ * need to emulate the [ blocking ] calling context of a regular
+ * switchdev event, so since both br->multicast_lock and RCU read side
+ * critical sections are atomic, we have no choice but to pick the RCU
+ * read side lock, queue up all our events, leave the critical section
+ * and notify switchdev from blocking context.
+ */
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+
+ if (mp->host_joined) {
+ err = br_mdb_queue_one(&mdb_list,
+ SWITCHDEV_OBJ_ID_HOST_MDB,
+ mp, br_dev);
+ if (err) {
+ rcu_read_unlock();
+ goto out_free_mdb;
+ }
+ }
+
+ for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
+ pp = &p->next) {
+ if (p->key.port->dev != dev)
+ continue;
+
+ err = br_mdb_queue_one(&mdb_list,
+ SWITCHDEV_OBJ_ID_PORT_MDB,
+ mp, dev);
+ if (err) {
+ rcu_read_unlock();
+ goto out_free_mdb;
+ }
+ }
+ }
+
+ rcu_read_unlock();
+
+ list_for_each_entry(obj, &mdb_list, list) {
+ err = br_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj),
+ extack);
+ if (err)
+ goto out_free_mdb;
+ }
+
+out_free_mdb:
+ list_for_each_entry_safe(obj, tmp, &mdb_list, list) {
+ list_del(&obj->list);
+ kfree(SWITCHDEV_OBJ_PORT_MDB(obj));
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(br_mdb_replay);
+
static void br_mdb_switchdev_host_port(struct net_device *dev,
struct net_device *lower_dev,
struct net_bridge_mdb_entry *mp,
@@ -515,18 +643,12 @@ static void br_mdb_switchdev_host_port(struct net_device *dev,
.obj = {
.id = SWITCHDEV_OBJ_ID_HOST_MDB,
.flags = SWITCHDEV_F_DEFER,
+ .orig_dev = dev,
},
- .vid = mp->addr.vid,
};
- if (mp->addr.proto == htons(ETH_P_IP))
- ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr);
-#if IS_ENABLED(CONFIG_IPV6)
- else
- ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr);
-#endif
+ br_switchdev_mdb_populate(&mdb, mp);
- mdb.obj.orig_dev = dev;
switch (type) {
case RTM_NEWMDB:
switchdev_port_obj_add(lower_dev, &mdb.obj, NULL);
@@ -558,21 +680,13 @@ void br_mdb_notify(struct net_device *dev,
.id = SWITCHDEV_OBJ_ID_PORT_MDB,
.flags = SWITCHDEV_F_DEFER,
},
- .vid = mp->addr.vid,
};
struct net *net = dev_net(dev);
struct sk_buff *skb;
int err = -ENOBUFS;
if (pg) {
- if (mp->addr.proto == htons(ETH_P_IP))
- ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr);
-#if IS_ENABLED(CONFIG_IPV6)
- else if (mp->addr.proto == htons(ETH_P_IPV6))
- ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr);
-#endif
- else
- ether_addr_copy(mdb.addr, mp->addr.dst.mac_addr);
+ br_switchdev_mdb_populate(&mdb, mp);
mdb.obj.orig_dev = pg->key.port->dev;
switch (type) {
diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c
index 12487f6fe9b4..cd2b1e424e54 100644
--- a/net/bridge/br_mrp.c
+++ b/net/bridge/br_mrp.c
@@ -411,6 +411,13 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp)
cancel_delayed_work_sync(&mrp->in_test_work);
br_mrp_switchdev_send_in_test(br, mrp, 0, 0, 0);
+ /* Disable the roles */
+ br_mrp_switchdev_set_ring_role(br, mrp, BR_MRP_RING_ROLE_DISABLED);
+ p = rtnl_dereference(mrp->i_port);
+ if (p)
+ br_mrp_switchdev_set_in_role(br, mrp, mrp->in_id, mrp->ring_id,
+ BR_MRP_IN_ROLE_DISABLED);
+
br_mrp_switchdev_del(br, mrp);
/* Reset the ports */
diff --git a/net/bridge/br_multicast_eht.c b/net/bridge/br_multicast_eht.c
index fea38b9a7268..13290a749d09 100644
--- a/net/bridge/br_multicast_eht.c
+++ b/net/bridge/br_multicast_eht.c
@@ -498,11 +498,13 @@ static void br_multicast_del_eht_host(struct net_bridge_port_group *pg,
&set_h->h_addr);
}
-static void __eht_allow_incl(struct net_bridge_port_group *pg,
- union net_bridge_eht_addr *h_addr,
- void *srcs,
- u32 nsrcs,
- size_t addr_size)
+/* create new set entries from reports */
+static void __eht_create_set_entries(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size,
+ int filter_mode)
{
union net_bridge_eht_addr eht_src_addr;
u32 src_idx;
@@ -511,72 +513,17 @@ static void __eht_allow_incl(struct net_bridge_port_group *pg,
for (src_idx = 0; src_idx < nsrcs; src_idx++) {
memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size);
br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr,
- MCAST_INCLUDE,
+ filter_mode,
false);
}
}
-static bool __eht_allow_excl(struct net_bridge_port_group *pg,
- union net_bridge_eht_addr *h_addr,
- void *srcs,
- u32 nsrcs,
- size_t addr_size)
-{
- bool changed = false, host_excl = false;
- union net_bridge_eht_addr eht_src_addr;
- struct net_bridge_group_src *src_ent;
- struct br_ip src_ip;
- u32 src_idx;
-
- host_excl = !!(br_multicast_eht_host_filter_mode(pg, h_addr) == MCAST_EXCLUDE);
- memset(&eht_src_addr, 0, sizeof(eht_src_addr));
- for (src_idx = 0; src_idx < nsrcs; src_idx++) {
- memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size);
- if (!host_excl) {
- br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr,
- MCAST_INCLUDE,
- false);
- } else {
- if (!br_multicast_del_eht_set_entry(pg, &eht_src_addr,
- h_addr))
- continue;
- memcpy(&src_ip, srcs + (src_idx * addr_size), addr_size);
- src_ent = br_multicast_find_group_src(pg, &src_ip);
- if (!src_ent)
- continue;
- br_multicast_del_group_src(src_ent, true);
- changed = true;
- }
- }
-
- return changed;
-}
-
-static bool br_multicast_eht_allow(struct net_bridge_port_group *pg,
- union net_bridge_eht_addr *h_addr,
- void *srcs,
- u32 nsrcs,
- size_t addr_size)
-{
- bool changed = false;
-
- switch (br_multicast_eht_host_filter_mode(pg, h_addr)) {
- case MCAST_INCLUDE:
- __eht_allow_incl(pg, h_addr, srcs, nsrcs, addr_size);
- break;
- case MCAST_EXCLUDE:
- changed = __eht_allow_excl(pg, h_addr, srcs, nsrcs, addr_size);
- break;
- }
-
- return changed;
-}
-
-static bool __eht_block_incl(struct net_bridge_port_group *pg,
- union net_bridge_eht_addr *h_addr,
- void *srcs,
- u32 nsrcs,
- size_t addr_size)
+/* delete existing set entries and their (S,G) entries if they were the last */
+static bool __eht_del_set_entries(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size)
{
union net_bridge_eht_addr eht_src_addr;
struct net_bridge_group_src *src_ent;
@@ -602,39 +549,23 @@ static bool __eht_block_incl(struct net_bridge_port_group *pg,
return changed;
}
-static bool __eht_block_excl(struct net_bridge_port_group *pg,
- union net_bridge_eht_addr *h_addr,
- void *srcs,
- u32 nsrcs,
- size_t addr_size)
+static bool br_multicast_eht_allow(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size)
{
- bool changed = false, host_excl = false;
- union net_bridge_eht_addr eht_src_addr;
- struct net_bridge_group_src *src_ent;
- struct br_ip src_ip;
- u32 src_idx;
+ bool changed = false;
- host_excl = !!(br_multicast_eht_host_filter_mode(pg, h_addr) == MCAST_EXCLUDE);
- memset(&eht_src_addr, 0, sizeof(eht_src_addr));
- memset(&src_ip, 0, sizeof(src_ip));
- src_ip.proto = pg->key.addr.proto;
- for (src_idx = 0; src_idx < nsrcs; src_idx++) {
- memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size);
- if (host_excl) {
- br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr,
- MCAST_EXCLUDE,
- false);
- } else {
- if (!br_multicast_del_eht_set_entry(pg, &eht_src_addr,
- h_addr))
- continue;
- memcpy(&src_ip, srcs + (src_idx * addr_size), addr_size);
- src_ent = br_multicast_find_group_src(pg, &src_ip);
- if (!src_ent)
- continue;
- br_multicast_del_group_src(src_ent, true);
- changed = true;
- }
+ switch (br_multicast_eht_host_filter_mode(pg, h_addr)) {
+ case MCAST_INCLUDE:
+ __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size,
+ MCAST_INCLUDE);
+ break;
+ case MCAST_EXCLUDE:
+ changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs,
+ addr_size);
+ break;
}
return changed;
@@ -650,10 +581,12 @@ static bool br_multicast_eht_block(struct net_bridge_port_group *pg,
switch (br_multicast_eht_host_filter_mode(pg, h_addr)) {
case MCAST_INCLUDE:
- changed = __eht_block_incl(pg, h_addr, srcs, nsrcs, addr_size);
+ changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs,
+ addr_size);
break;
case MCAST_EXCLUDE:
- changed = __eht_block_excl(pg, h_addr, srcs, nsrcs, addr_size);
+ __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size,
+ MCAST_EXCLUDE);
break;
}
@@ -671,7 +604,6 @@ static bool __eht_inc_exc(struct net_bridge_port_group *pg,
{
bool changed = false, flush_entries = to_report;
union net_bridge_eht_addr eht_src_addr;
- u32 src_idx;
if (br_multicast_eht_host_filter_mode(pg, h_addr) != filter_mode)
flush_entries = true;
@@ -680,11 +612,8 @@ static bool __eht_inc_exc(struct net_bridge_port_group *pg,
/* if we're changing mode del host and its entries */
if (flush_entries)
br_multicast_del_eht_host(pg, h_addr);
- for (src_idx = 0; src_idx < nsrcs; src_idx++) {
- memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size);
- br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr,
- filter_mode, false);
- }
+ __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size,
+ filter_mode);
/* we can be missing sets only if we've deleted some entries */
if (flush_entries) {
struct net_bridge *br = pg->key.port->br;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index d7d167e10b70..50747990188e 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1118,6 +1118,13 @@ void br_vlan_notify(const struct net_bridge *br,
bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
const struct net_bridge_vlan *range_end);
+void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+ struct net_device_path_ctx *ctx,
+ struct net_device_path *path);
+int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+ struct net_bridge_port *dst,
+ struct net_device_path *path);
+
static inline struct net_bridge_vlan_group *br_vlan_group(
const struct net_bridge *br)
{
@@ -1277,6 +1284,19 @@ static inline int nbp_get_num_vlan_infos(struct net_bridge_port *p,
return 0;
}
+static inline void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+ struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+}
+
+static inline int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+ struct net_bridge_port *dst,
+ struct net_device_path *path)
+{
+ return 0;
+}
+
static inline struct net_bridge_vlan_group *br_vlan_group(
const struct net_bridge *br)
{
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 21c6781906aa..3dafb6143cff 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -64,6 +64,20 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
}
}
+u8 br_port_get_stp_state(const struct net_device *dev)
+{
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ return BR_STATE_DISABLED;
+
+ return p->state;
+}
+EXPORT_SYMBOL_GPL(br_port_get_stp_state);
+
/* called under bridge lock */
struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no)
{
@@ -625,6 +639,19 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
return 0;
}
+clock_t br_get_ageing_time(struct net_device *br_dev)
+{
+ struct net_bridge *br;
+
+ if (!netif_is_bridge_master(br_dev))
+ return 0;
+
+ br = netdev_priv(br_dev);
+
+ return jiffies_to_clock_t(br->ageing_time);
+}
+EXPORT_SYMBOL_GPL(br_get_ageing_time);
+
/* called under bridge lock */
void __br_set_topology_change(struct net_bridge *br, unsigned char val)
{
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 8829f621b8ec..da3256a3eed0 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1339,6 +1339,61 @@ int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid)
}
EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu);
+void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+ struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct net_bridge_vlan_group *vg;
+ int idx = ctx->num_vlans - 1;
+ u16 vid;
+
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;
+
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED))
+ return;
+
+ vg = br_vlan_group(br);
+
+ if (idx >= 0 &&
+ ctx->vlan[idx].proto == br->vlan_proto) {
+ vid = ctx->vlan[idx].id;
+ } else {
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG;
+ vid = br_get_pvid(vg);
+ }
+
+ path->bridge.vlan_id = vid;
+ path->bridge.vlan_proto = br->vlan_proto;
+}
+
+int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+ struct net_bridge_port *dst,
+ struct net_device_path *path)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED))
+ return 0;
+
+ vg = nbp_vlan_group_rcu(dst);
+ v = br_vlan_find(vg, path->bridge.vlan_id);
+ if (!v || !br_vlan_should_use(v))
+ return -EINVAL;
+
+ if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED))
+ return 0;
+
+ if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG)
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;
+ else if (v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW;
+ else
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG;
+
+ return 0;
+}
+
int br_vlan_get_info(const struct net_device *dev, u16 vid,
struct bridge_vlan_info *p_vinfo)
{
@@ -1751,6 +1806,79 @@ out_kfree:
kfree_skb(skb);
}
+static int br_vlan_replay_one(struct notifier_block *nb,
+ struct net_device *dev,
+ struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_notifier_port_obj_info obj_info = {
+ .info = {
+ .dev = dev,
+ .extack = extack,
+ },
+ .obj = &vlan->obj,
+ };
+ int err;
+
+ err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info);
+ return notifier_to_errno(err);
+}
+
+int br_vlan_replay(struct net_device *br_dev, struct net_device *dev,
+ struct notifier_block *nb, struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_bridge_port *p;
+ struct net_bridge *br;
+ int err = 0;
+ u16 pvid;
+
+ ASSERT_RTNL();
+
+ if (!netif_is_bridge_master(br_dev))
+ return -EINVAL;
+
+ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev))
+ return -EINVAL;
+
+ if (netif_is_bridge_master(dev)) {
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
+ p = NULL;
+ } else {
+ p = br_port_get_rtnl(dev);
+ if (WARN_ON(!p))
+ return -EINVAL;
+ vg = nbp_vlan_group(p);
+ br = p->br;
+ }
+
+ if (!vg)
+ return 0;
+
+ pvid = br_get_pvid(vg);
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ struct switchdev_obj_port_vlan vlan = {
+ .obj.orig_dev = dev,
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .flags = br_vlan_flags(v, pvid),
+ .vid = v->vid,
+ };
+
+ if (!br_vlan_should_use(v))
+ continue;
+
+ err = br_vlan_replay_one(nb, dev, &vlan, extack);
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(br_vlan_replay);
+
/* check if v_curr can enter a range ending in range_end */
bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
const struct net_bridge_vlan *range_end)
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
index 169e005fbda2..0d3a8c01552e 100644
--- a/net/bridge/br_vlan_tunnel.c
+++ b/net/bridge/br_vlan_tunnel.c
@@ -35,7 +35,7 @@ static const struct rhashtable_params br_vlan_tunnel_rht_params = {
};
static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl,
- u64 tunnel_id)
+ __be64 tunnel_id)
{
return rhashtable_lookup_fast(tbl, &tunnel_id,
br_vlan_tunnel_rht_params);
diff --git a/net/core/Makefile b/net/core/Makefile
index 3e2c378e5f31..0c2233c826fd 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,7 +16,6 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
obj-y += net-sysfs.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
-obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
@@ -28,10 +27,13 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
-obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o
obj-$(CONFIG_FAILOVER) += failover.o
+ifeq ($(CONFIG_INET),y)
+obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
+obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
+endif
obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 4edd033e899c..cc3712ad8716 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -89,7 +89,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
smap = (struct bpf_local_storage_map *)map;
bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap);
+ bpf_local_storage_map_free(smap, NULL);
}
static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
diff --git a/net/core/dev.c b/net/core/dev.c
index 0f72ff5d34ba..48b529d59157 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -848,6 +848,52 @@ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
+static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
+{
+ int k = stack->num_paths++;
+
+ if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
+ return NULL;
+
+ return &stack->path[k];
+}
+
+int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
+ struct net_device_path_stack *stack)
+{
+ const struct net_device *last_dev;
+ struct net_device_path_ctx ctx = {
+ .dev = dev,
+ .daddr = daddr,
+ };
+ struct net_device_path *path;
+ int ret = 0;
+
+ stack->num_paths = 0;
+ while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
+ last_dev = ctx.dev;
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+
+ memset(path, 0, sizeof(struct net_device_path));
+ ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
+ if (ret < 0)
+ return -1;
+
+ if (WARN_ON_ONCE(last_dev == ctx.dev))
+ return -1;
+ }
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+ path->type = DEV_PATH_ETHERNET;
+ path->dev = ctx.dev;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_fill_forward_path);
+
/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
@@ -2463,16 +2509,14 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
EXPORT_SYMBOL(netdev_txq_to_tc);
#ifdef CONFIG_XPS
-struct static_key xps_needed __read_mostly;
-EXPORT_SYMBOL(xps_needed);
-struct static_key xps_rxqs_needed __read_mostly;
-EXPORT_SYMBOL(xps_rxqs_needed);
+static struct static_key xps_needed __read_mostly;
+static struct static_key xps_rxqs_needed __read_mostly;
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
- int tci, u16 index)
+ struct xps_dev_maps *old_maps, int tci, u16 index)
{
struct xps_map *map = NULL;
int pos;
@@ -2491,6 +2535,8 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
break;
}
+ if (old_maps)
+ RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
kfree_rcu(map, rcu);
return false;
@@ -2503,7 +2549,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
struct xps_dev_maps *dev_maps,
int cpu, u16 offset, u16 count)
{
- int num_tc = dev->num_tc ? : 1;
+ int num_tc = dev_maps->num_tc;
bool active = false;
int tci;
@@ -2511,7 +2557,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
int i, j;
for (i = count, j = offset; i--; j++) {
- if (!remove_xps_queue(dev_maps, tci, j))
+ if (!remove_xps_queue(dev_maps, NULL, tci, j))
break;
}
@@ -2523,74 +2569,54 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
static void reset_xps_maps(struct net_device *dev,
struct xps_dev_maps *dev_maps,
- bool is_rxqs_map)
+ enum xps_map_type type)
{
- if (is_rxqs_map) {
- static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
- RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
- } else {
- RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
- }
static_key_slow_dec_cpuslocked(&xps_needed);
+ if (type == XPS_RXQS)
+ static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
+
+ RCU_INIT_POINTER(dev->xps_maps[type], NULL);
+
kfree_rcu(dev_maps, rcu);
}
-static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
- struct xps_dev_maps *dev_maps, unsigned int nr_ids,
- u16 offset, u16 count, bool is_rxqs_map)
+static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
+ u16 offset, u16 count)
{
+ struct xps_dev_maps *dev_maps;
bool active = false;
int i, j;
- for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
- j < nr_ids;)
- active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
- count);
+ dev_maps = xmap_dereference(dev->xps_maps[type]);
+ if (!dev_maps)
+ return;
+
+ for (j = 0; j < dev_maps->nr_ids; j++)
+ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
if (!active)
- reset_xps_maps(dev, dev_maps, is_rxqs_map);
+ reset_xps_maps(dev, dev_maps, type);
- if (!is_rxqs_map) {
- for (i = offset + (count - 1); count--; i--) {
+ if (type == XPS_CPUS) {
+ for (i = offset + (count - 1); count--; i--)
netdev_queue_numa_node_write(
- netdev_get_tx_queue(dev, i),
- NUMA_NO_NODE);
- }
+ netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
}
}
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
u16 count)
{
- const unsigned long *possible_mask = NULL;
- struct xps_dev_maps *dev_maps;
- unsigned int nr_ids;
-
if (!static_key_false(&xps_needed))
return;
cpus_read_lock();
mutex_lock(&xps_map_mutex);
- if (static_key_false(&xps_rxqs_needed)) {
- dev_maps = xmap_dereference(dev->xps_rxqs_map);
- if (dev_maps) {
- nr_ids = dev->num_rx_queues;
- clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
- offset, count, true);
- }
- }
-
- dev_maps = xmap_dereference(dev->xps_cpus_map);
- if (!dev_maps)
- goto out_no_maps;
+ if (static_key_false(&xps_rxqs_needed))
+ clean_xps_maps(dev, XPS_RXQS, offset, count);
- if (num_possible_cpus() > 1)
- possible_mask = cpumask_bits(cpu_possible_mask);
- nr_ids = nr_cpu_ids;
- clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
- false);
+ clean_xps_maps(dev, XPS_CPUS, offset, count);
-out_no_maps:
mutex_unlock(&xps_map_mutex);
cpus_read_unlock();
}
@@ -2640,16 +2666,35 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
return new_map;
}
+/* Copy xps maps at a given index */
+static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
+ struct xps_dev_maps *new_dev_maps, int index,
+ int tc, bool skip_tc)
+{
+ int i, tci = index * dev_maps->num_tc;
+ struct xps_map *map;
+
+ /* copy maps belonging to foreign traffic classes */
+ for (i = 0; i < dev_maps->num_tc; i++, tci++) {
+ if (i == tc && skip_tc)
+ continue;
+
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
+ }
+}
+
/* Must be called under cpus_read_lock */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
- u16 index, bool is_rxqs_map)
+ u16 index, enum xps_map_type type)
{
- const unsigned long *online_mask = NULL, *possible_mask = NULL;
- struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
+ const unsigned long *online_mask = NULL;
+ bool active = false, copy = false;
int i, j, tci, numa_node_id = -2;
int maps_sz, num_tc = 1, tc = 0;
struct xps_map *map, *new_map;
- bool active = false;
unsigned int nr_ids;
if (dev->num_tc) {
@@ -2667,38 +2712,48 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
}
mutex_lock(&xps_map_mutex);
- if (is_rxqs_map) {
+
+ dev_maps = xmap_dereference(dev->xps_maps[type]);
+ if (type == XPS_RXQS) {
maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
- dev_maps = xmap_dereference(dev->xps_rxqs_map);
nr_ids = dev->num_rx_queues;
} else {
maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
- if (num_possible_cpus() > 1) {
+ if (num_possible_cpus() > 1)
online_mask = cpumask_bits(cpu_online_mask);
- possible_mask = cpumask_bits(cpu_possible_mask);
- }
- dev_maps = xmap_dereference(dev->xps_cpus_map);
nr_ids = nr_cpu_ids;
}
if (maps_sz < L1_CACHE_BYTES)
maps_sz = L1_CACHE_BYTES;
+ /* The old dev_maps could be larger or smaller than the one we're
+ * setting up now, as dev->num_tc or nr_ids could have been updated in
+ * between. We could try to be smart, but let's be safe instead and only
+ * copy foreign traffic classes if the two map sizes match.
+ */
+ if (dev_maps &&
+ dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
+ copy = true;
+
/* allocate memory for queue storage */
for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
j < nr_ids;) {
- if (!new_dev_maps)
- new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
- mutex_unlock(&xps_map_mutex);
- return -ENOMEM;
+ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
+ if (!new_dev_maps) {
+ mutex_unlock(&xps_map_mutex);
+ return -ENOMEM;
+ }
+
+ new_dev_maps->nr_ids = nr_ids;
+ new_dev_maps->num_tc = num_tc;
}
tci = j * num_tc + tc;
- map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
- NULL;
+ map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
- map = expand_xps_map(map, j, index, is_rxqs_map);
+ map = expand_xps_map(map, j, index, type == XPS_RXQS);
if (!map)
goto error;
@@ -2711,29 +2766,21 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
if (!dev_maps) {
/* Increment static keys at most once per type */
static_key_slow_inc_cpuslocked(&xps_needed);
- if (is_rxqs_map)
+ if (type == XPS_RXQS)
static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
}
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- /* copy maps belonging to foreign traffic classes */
- for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
- }
+ for (j = 0; j < nr_ids; j++) {
+ bool skip_tc = false;
- /* We need to explicitly update tci as prevous loop
- * could break out early if dev_maps is NULL.
- */
tci = j * num_tc + tc;
-
if (netif_attr_test_mask(j, mask, nr_ids) &&
netif_attr_test_online(j, online_mask, nr_ids)) {
/* add tx-queue to CPU/rx-queue maps */
int pos = 0;
+ skip_tc = true;
+
map = xmap_dereference(new_dev_maps->attr_map[tci]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
@@ -2741,78 +2788,81 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
if (pos == map->len)
map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
- if (!is_rxqs_map) {
+ if (type == XPS_CPUS) {
if (numa_node_id == -2)
numa_node_id = cpu_to_node(j);
else if (numa_node_id != cpu_to_node(j))
numa_node_id = -1;
}
#endif
- } else if (dev_maps) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
- /* copy maps belonging to foreign traffic classes */
- for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
- }
+ if (copy)
+ xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
+ skip_tc);
}
- if (is_rxqs_map)
- rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
- else
- rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
+ rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
/* Cleanup old maps */
if (!dev_maps)
goto out_no_old_maps;
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- for (i = num_tc, tci = j * num_tc; i--; tci++) {
- new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ for (j = 0; j < dev_maps->nr_ids; j++) {
+ for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
map = xmap_dereference(dev_maps->attr_map[tci]);
- if (map && map != new_map)
- kfree_rcu(map, rcu);
+ if (!map)
+ continue;
+
+ if (copy) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ if (map == new_map)
+ continue;
+ }
+
+ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
+ kfree_rcu(map, rcu);
}
}
- kfree_rcu(dev_maps, rcu);
+ old_dev_maps = dev_maps;
out_no_old_maps:
dev_maps = new_dev_maps;
active = true;
out_no_new_maps:
- if (!is_rxqs_map) {
+ if (type == XPS_CPUS)
/* update Tx queue numa node */
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
(numa_node_id >= 0) ?
numa_node_id : NUMA_NO_NODE);
- }
if (!dev_maps)
goto out_no_maps;
/* removes tx-queue from unused CPUs/rx-queues */
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- for (i = tc, tci = j * num_tc; i--; tci++)
- active |= remove_xps_queue(dev_maps, tci, index);
- if (!netif_attr_test_mask(j, mask, nr_ids) ||
- !netif_attr_test_online(j, online_mask, nr_ids))
- active |= remove_xps_queue(dev_maps, tci, index);
- for (i = num_tc - tc, tci++; --i; tci++)
- active |= remove_xps_queue(dev_maps, tci, index);
+ for (j = 0; j < dev_maps->nr_ids; j++) {
+ tci = j * dev_maps->num_tc;
+
+ for (i = 0; i < dev_maps->num_tc; i++, tci++) {
+ if (i == tc &&
+ netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
+ netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
+ continue;
+
+ active |= remove_xps_queue(dev_maps,
+ copy ? old_dev_maps : NULL,
+ tci, index);
+ }
}
+ if (old_dev_maps)
+ kfree_rcu(old_dev_maps, rcu);
+
/* free map if not active */
if (!active)
- reset_xps_maps(dev, dev_maps, is_rxqs_map);
+ reset_xps_maps(dev, dev_maps, type);
out_no_maps:
mutex_unlock(&xps_map_mutex);
@@ -2820,11 +2870,10 @@ out_no_maps:
return 0;
error:
/* remove any maps that we added */
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
+ for (j = 0; j < nr_ids; j++) {
for (i = num_tc, tci = j * num_tc; i--; tci++) {
new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
- map = dev_maps ?
+ map = copy ?
xmap_dereference(dev_maps->attr_map[tci]) :
NULL;
if (new_map && new_map != map)
@@ -2845,7 +2894,7 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
int ret;
cpus_read_lock();
- ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+ ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
cpus_read_unlock();
return ret;
@@ -3956,13 +4005,15 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
struct xps_dev_maps *dev_maps, unsigned int tci)
{
+ int tc = netdev_get_prio_tc_map(dev, skb->priority);
struct xps_map *map;
int queue_index = -1;
- if (dev->num_tc) {
- tci *= dev->num_tc;
- tci += netdev_get_prio_tc_map(dev, skb->priority);
- }
+ if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
+ return queue_index;
+
+ tci *= dev_maps->num_tc;
+ tci += tc;
map = rcu_dereference(dev_maps->attr_map[tci]);
if (map) {
@@ -3993,18 +4044,18 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
if (!static_key_false(&xps_rxqs_needed))
goto get_cpus_map;
- dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
+ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
if (dev_maps) {
int tci = sk_rx_queue_get(sk);
- if (tci >= 0 && tci < dev->num_rx_queues)
+ if (tci >= 0)
queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
tci);
}
get_cpus_map:
if (queue_index < 0) {
- dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
+ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
if (dev_maps) {
unsigned int tci = skb->sender_cpu - 1;
@@ -5284,6 +5335,7 @@ skip_classify:
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
+ break;
case RX_HANDLER_PASS:
break;
default:
@@ -5876,15 +5928,13 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
}
EXPORT_SYMBOL(napi_gro_flush);
-static struct list_head *gro_list_prepare(struct napi_struct *napi,
- struct sk_buff *skb)
+static void gro_list_prepare(const struct list_head *head,
+ const struct sk_buff *skb)
{
unsigned int maclen = skb->dev->hard_header_len;
u32 hash = skb_get_hash_raw(skb);
- struct list_head *head;
struct sk_buff *p;
- head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
list_for_each_entry(p, head, list) {
unsigned long diffs;
@@ -5910,8 +5960,6 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi,
maclen);
NAPI_GRO_CB(p)->same_flow = !diffs;
}
-
- return head;
}
static void skb_gro_reset_offset(struct sk_buff *skb)
@@ -5974,11 +6022,11 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
- u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ struct gro_list *gro_list = &napi->gro_hash[bucket];
struct list_head *head = &offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
- struct list_head *gro_head;
struct sk_buff *pp = NULL;
enum gro_result ret;
int same_flow;
@@ -5987,7 +6035,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (netif_elide_gro(skb->dev))
goto normal;
- gro_head = gro_list_prepare(napi, skb);
+ gro_list_prepare(&gro_list->list, skb);
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
@@ -6023,7 +6071,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
ipv6_gro_receive, inet_gro_receive,
- gro_head, skb);
+ &gro_list->list, skb);
break;
}
rcu_read_unlock();
@@ -6042,7 +6090,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (pp) {
skb_list_del_init(pp);
napi_gro_complete(napi, pp);
- napi->gro_hash[hash].count--;
+ gro_list->count--;
}
if (same_flow)
@@ -6051,16 +6099,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (NAPI_GRO_CB(skb)->flush)
goto normal;
- if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
- gro_flush_oldest(napi, gro_head);
- } else {
- napi->gro_hash[hash].count++;
- }
+ if (unlikely(gro_list->count >= MAX_GRO_SKBS))
+ gro_flush_oldest(napi, &gro_list->list);
+ else
+ gro_list->count++;
+
NAPI_GRO_CB(skb)->count = 1;
NAPI_GRO_CB(skb)->age = jiffies;
NAPI_GRO_CB(skb)->last = skb;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
- list_add(&skb->list, gro_head);
+ list_add(&skb->list, &gro_list->list);
ret = GRO_HELD;
pull:
@@ -6068,11 +6116,11 @@ pull:
if (grow > 0)
gro_pull_from_frag0(skb, grow);
ok:
- if (napi->gro_hash[hash].count) {
- if (!test_bit(hash, &napi->gro_bitmask))
- __set_bit(hash, &napi->gro_bitmask);
- } else if (test_bit(hash, &napi->gro_bitmask)) {
- __clear_bit(hash, &napi->gro_bitmask);
+ if (gro_list->count) {
+ if (!test_bit(bucket, &napi->gro_bitmask))
+ __set_bit(bucket, &napi->gro_bitmask);
+ } else if (test_bit(bucket, &napi->gro_bitmask)) {
+ __clear_bit(bucket, &napi->gro_bitmask);
}
return ret;
@@ -6789,6 +6837,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
return err;
}
+EXPORT_SYMBOL(dev_set_threaded);
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
@@ -10336,14 +10385,20 @@ EXPORT_SYMBOL(register_netdev);
int netdev_refcnt_read(const struct net_device *dev)
{
+#ifdef CONFIG_PCPU_DEV_REFCNT
int i, refcnt = 0;
for_each_possible_cpu(i)
refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
return refcnt;
+#else
+ return refcount_read(&dev->dev_refcnt);
+#endif
}
EXPORT_SYMBOL(netdev_refcnt_read);
+int netdev_unregister_timeout_secs __read_mostly = 10;
+
#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250
/**
@@ -10368,7 +10423,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
rebroadcast_time = warning_time = jiffies;
refcnt = netdev_refcnt_read(dev);
- while (refcnt != 0) {
+ while (refcnt != 1) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
@@ -10405,7 +10460,9 @@ static void netdev_wait_allrefs(struct net_device *dev)
refcnt = netdev_refcnt_read(dev);
- if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
+ if (refcnt &&
+ time_after(jiffies, warning_time +
+ netdev_unregister_timeout_secs * HZ)) {
pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
dev->name, refcnt);
warning_time = jiffies;
@@ -10481,7 +10538,7 @@ void netdev_run_todo(void)
netdev_wait_allrefs(dev);
/* paranoia */
- BUG_ON(netdev_refcnt_read(dev));
+ BUG_ON(netdev_refcnt_read(dev) != 1);
BUG_ON(!list_empty(&dev->ptype_all));
BUG_ON(!list_empty(&dev->ptype_specific));
WARN_ON(rcu_access_pointer(dev->ip_ptr));
@@ -10698,9 +10755,14 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
+#ifdef CONFIG_PCPU_DEV_REFCNT
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
goto free_dev;
+ dev_hold(dev);
+#else
+ refcount_set(&dev->dev_refcnt, 1);
+#endif
if (dev_addr_init(dev))
goto free_pcpu;
@@ -10764,8 +10826,10 @@ free_all:
return NULL;
free_pcpu:
+#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
free_dev:
+#endif
netdev_freemem(dev);
return NULL;
}
@@ -10807,8 +10871,10 @@ void free_netdev(struct net_device *dev)
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
+#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;
+#endif
free_percpu(dev->xdp_bulkq);
dev->xdp_bulkq = NULL;
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index db65ce62b625..ead2a8aa57b4 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -1754,7 +1754,7 @@ static void exit_net_drop_monitor(void)
/*
* Because of the module_get/put we do in the trace state change path
- * we are guarnateed not to have any current users when we get here
+ * we are guaranteed not to have any current users when we get here
*/
for_each_possible_cpu(cpu) {
diff --git a/net/core/filter.c b/net/core/filter.c
index 9323d34d34cc..f5eeebf6a16f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1863,10 +1863,7 @@ static const struct bpf_func_proto bpf_sk_fullsock_proto = {
static inline int sk_skb_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
- int err = __bpf_try_make_writable(skb, write_len);
-
- bpf_compute_data_end_sk_skb(skb);
- return err;
+ return __bpf_try_make_writable(skb, write_len);
}
BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -3412,6 +3409,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
+ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
BPF_F_ADJ_ROOM_ENCAP_L2( \
BPF_ADJ_ROOM_ENCAP_L2_MASK))
@@ -3448,6 +3446,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
return -EINVAL;
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
+ inner_mac_len < ETH_HLEN)
+ return -EINVAL;
+
if (skb->encapsulation)
return -EALREADY;
@@ -3466,7 +3468,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
skb->inner_mac_header = inner_net - inner_mac_len;
skb->inner_network_header = inner_net;
skb->inner_transport_header = inner_trans;
- skb_set_inner_protocol(skb, skb->protocol);
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
+ skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+ else
+ skb_set_inner_protocol(skb, skb->protocol);
skb->encapsulation = 1;
skb_set_network_header(skb, mac_len);
@@ -3577,7 +3583,6 @@ BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
return -ENOMEM;
__skb_pull(skb, len_diff_abs);
}
- bpf_compute_data_end_sk_skb(skb);
if (tls_sw_has_ctx_rx(skb->sk)) {
struct strp_msg *rxm = strp_msg(skb);
@@ -3742,10 +3747,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
u64, flags)
{
- int ret = __bpf_skb_change_tail(skb, new_len, flags);
-
- bpf_compute_data_end_sk_skb(skb);
- return ret;
+ return __bpf_skb_change_tail(skb, new_len, flags);
}
static const struct bpf_func_proto sk_skb_change_tail_proto = {
@@ -3808,10 +3810,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
u64, flags)
{
- int ret = __bpf_skb_change_head(skb, head_room, flags);
-
- bpf_compute_data_end_sk_skb(skb);
- return ret;
+ return __bpf_skb_change_head(skb, head_room, flags);
}
static const struct bpf_func_proto sk_skb_change_head_proto = {
@@ -3919,23 +3918,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
.arg2_type = ARG_ANYTHING,
};
-static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
- struct bpf_map *map, struct xdp_buff *xdp)
-{
- switch (map->map_type) {
- case BPF_MAP_TYPE_DEVMAP:
- case BPF_MAP_TYPE_DEVMAP_HASH:
- return dev_map_enqueue(fwd, xdp, dev_rx);
- case BPF_MAP_TYPE_CPUMAP:
- return cpu_map_enqueue(fwd, xdp, dev_rx);
- case BPF_MAP_TYPE_XSKMAP:
- return __xsk_map_redirect(fwd, xdp);
- default:
- return -EBADRQC;
- }
- return 0;
-}
-
void xdp_do_flush(void)
{
__dev_flush();
@@ -3944,71 +3926,52 @@ void xdp_do_flush(void)
}
EXPORT_SYMBOL_GPL(xdp_do_flush);
-static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
-{
- switch (map->map_type) {
- case BPF_MAP_TYPE_DEVMAP:
- return __dev_map_lookup_elem(map, index);
- case BPF_MAP_TYPE_DEVMAP_HASH:
- return __dev_map_hash_lookup_elem(map, index);
- case BPF_MAP_TYPE_CPUMAP:
- return __cpu_map_lookup_elem(map, index);
- case BPF_MAP_TYPE_XSKMAP:
- return __xsk_map_lookup_elem(map, index);
- default:
- return NULL;
- }
-}
-
-void bpf_clear_redirect_map(struct bpf_map *map)
-{
- struct bpf_redirect_info *ri;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- ri = per_cpu_ptr(&bpf_redirect_info, cpu);
- /* Avoid polluting remote cacheline due to writes if
- * not needed. Once we pass this test, we need the
- * cmpxchg() to make sure it hasn't been changed in
- * the meantime by remote CPU.
- */
- if (unlikely(READ_ONCE(ri->map) == map))
- cmpxchg(&ri->map, map, NULL);
- }
-}
-
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct bpf_map *map = READ_ONCE(ri->map);
- u32 index = ri->tgt_index;
+ enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
int err;
- ri->tgt_index = 0;
- ri->tgt_value = NULL;
- WRITE_ONCE(ri->map, NULL);
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
- if (unlikely(!map)) {
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
+ switch (map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ fallthrough;
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ err = dev_map_enqueue(fwd, xdp, dev);
+ break;
+ case BPF_MAP_TYPE_CPUMAP:
+ err = cpu_map_enqueue(fwd, xdp, dev);
+ break;
+ case BPF_MAP_TYPE_XSKMAP:
+ err = __xsk_map_redirect(fwd, xdp);
+ break;
+ case BPF_MAP_TYPE_UNSPEC:
+ if (map_id == INT_MAX) {
+ fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ break;
+ }
+ err = dev_xdp_enqueue(fwd, xdp, dev);
+ break;
}
-
- err = dev_xdp_enqueue(fwd, xdp, dev);
- } else {
- err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
+ fallthrough;
+ default:
+ err = -EBADRQC;
}
if (unlikely(err))
goto err;
- _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
return 0;
err:
- _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
return err;
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);
@@ -4017,41 +3980,36 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
struct sk_buff *skb,
struct xdp_buff *xdp,
struct bpf_prog *xdp_prog,
- struct bpf_map *map)
+ void *fwd,
+ enum bpf_map_type map_type, u32 map_id)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- u32 index = ri->tgt_index;
- void *fwd = ri->tgt_value;
- int err = 0;
-
- ri->tgt_index = 0;
- ri->tgt_value = NULL;
- WRITE_ONCE(ri->map, NULL);
-
- if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
- map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- struct bpf_dtab_netdev *dst = fwd;
+ int err;
- err = dev_map_generic_redirect(dst, skb, xdp_prog);
+ switch (map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ fallthrough;
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ err = dev_map_generic_redirect(fwd, skb, xdp_prog);
if (unlikely(err))
goto err;
- } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
- struct xdp_sock *xs = fwd;
-
- err = xsk_generic_rcv(xs, xdp);
+ break;
+ case BPF_MAP_TYPE_XSKMAP:
+ err = xsk_generic_rcv(fwd, xdp);
if (err)
goto err;
consume_skb(skb);
- } else {
+ break;
+ default:
/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
err = -EBADRQC;
goto err;
}
- _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
return 0;
err:
- _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
return err;
}
@@ -4059,31 +4017,34 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct bpf_map *map = READ_ONCE(ri->map);
- u32 index = ri->tgt_index;
- struct net_device *fwd;
- int err = 0;
-
- if (map)
- return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
- map);
- ri->tgt_index = 0;
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
+ enum bpf_map_type map_type = ri->map_type;
+ void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
+ int err;
- err = xdp_ok_fwd_dev(fwd, skb->len);
- if (unlikely(err))
- goto err;
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
- skb->dev = fwd;
- _trace_xdp_redirect(dev, xdp_prog, index);
- generic_xdp_tx(skb, xdp_prog);
- return 0;
+ if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
+ fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = xdp_ok_fwd_dev(fwd, skb->len);
+ if (unlikely(err))
+ goto err;
+
+ skb->dev = fwd;
+ _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
+ generic_xdp_tx(skb, xdp_prog);
+ return 0;
+ }
+
+ return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id);
err:
- _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+ _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
return err;
}
@@ -4094,10 +4055,12 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
if (unlikely(flags))
return XDP_ABORTED;
- ri->flags = flags;
+ /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
+ * by map_idr) is used for ifindex based XDP redirect.
+ */
ri->tgt_index = ifindex;
- ri->tgt_value = NULL;
- WRITE_ONCE(ri->map, NULL);
+ ri->map_id = INT_MAX;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
return XDP_REDIRECT;
}
@@ -4113,28 +4076,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-
- /* Lower bits of the flags are used as return code on lookup failure */
- if (unlikely(flags > XDP_TX))
- return XDP_ABORTED;
-
- ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
- if (unlikely(!ri->tgt_value)) {
- /* If the lookup fails we want to clear out the state in the
- * redirect_info struct completely, so that if an eBPF program
- * performs multiple lookups, the last one always takes
- * precedence.
- */
- WRITE_ONCE(ri->map, NULL);
- return flags;
- }
-
- ri->flags = flags;
- ri->tgt_index = ifindex;
- WRITE_ONCE(ri->map, map);
-
- return XDP_REDIRECT;
+ return map->ops->map_redirect(map, ifindex, flags);
}
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
@@ -9663,22 +9605,40 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+/* data_end = skb->data + skb_headlen() */
+static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ /* si->dst_reg = skb->data */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, data));
+ /* AX = skb->len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, len));
+ /* si->dst_reg = skb->data + skb->len */
+ *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+ /* AX = skb->data_len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, data_len));
+ /* si->dst_reg = skb->data + skb->len - skb->data_len */
+ *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX);
+
+ return insn;
+}
+
static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
- int off;
switch (si->off) {
case offsetof(struct __sk_buff, data_end):
- off = si->off;
- off -= offsetof(struct __sk_buff, data_end);
- off += offsetof(struct sk_buff, cb);
- off += offsetof(struct tcp_skb_cb, bpf.data_end);
- *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
- si->src_reg, off);
+ insn = bpf_convert_data_end_access(si, insn);
break;
default:
return bpf_convert_ctx_access(type, si, insn_buf, prog,
@@ -10457,6 +10417,7 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
}
const struct bpf_prog_ops sk_lookup_prog_ops = {
+ .test_run = bpf_prog_test_run_sk_lookup,
};
const struct bpf_verifier_ops sk_lookup_verifier_ops = {
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index a96a4f5de0ce..5985029e43d4 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -114,7 +114,7 @@ int flow_dissector_bpf_prog_attach_check(struct net *net,
* is the protocol port offset returned from proto_ports_offset
*/
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
- void *data, int hlen)
+ const void *data, int hlen)
{
int poff = proto_ports_offset(ip_proto);
@@ -161,7 +161,7 @@ static bool icmp_has_id(u8 type)
*/
void skb_flow_get_icmp_tci(const struct sk_buff *skb,
struct flow_dissector_key_icmp *key_icmp,
- void *data, int thoff, int hlen)
+ const void *data, int thoff, int hlen)
{
struct icmphdr *ih, _ih;
@@ -187,8 +187,8 @@ EXPORT_SYMBOL(skb_flow_get_icmp_tci);
*/
static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container,
- void *data, int thoff, int hlen)
+ void *target_container, const void *data,
+ int thoff, int hlen)
{
struct flow_dissector_key_icmp *key_icmp;
@@ -409,8 +409,8 @@ EXPORT_SYMBOL(skb_flow_dissect_hash);
static enum flow_dissect_ret
__skb_flow_dissect_mpls(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff, int hlen,
- int lse_index, bool *entropy_label)
+ void *target_container, const void *data, int nhoff,
+ int hlen, int lse_index, bool *entropy_label)
{
struct mpls_label *hdr, _hdr;
u32 entry, label, bos;
@@ -467,7 +467,8 @@ __skb_flow_dissect_mpls(const struct sk_buff *skb,
static enum flow_dissect_ret
__skb_flow_dissect_arp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff, int hlen)
+ void *target_container, const void *data,
+ int nhoff, int hlen)
{
struct flow_dissector_key_arp *key_arp;
struct {
@@ -523,7 +524,7 @@ static enum flow_dissect_ret
__skb_flow_dissect_gre(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control,
struct flow_dissector *flow_dissector,
- void *target_container, void *data,
+ void *target_container, const void *data,
__be16 *p_proto, int *p_nhoff, int *p_hlen,
unsigned int flags)
{
@@ -663,8 +664,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
static enum flow_dissect_ret
__skb_flow_dissect_batadv(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control,
- void *data, __be16 *p_proto, int *p_nhoff, int hlen,
- unsigned int flags)
+ const void *data, __be16 *p_proto, int *p_nhoff,
+ int hlen, unsigned int flags)
{
struct {
struct batadv_unicast_packet batadv_unicast;
@@ -695,7 +696,8 @@ __skb_flow_dissect_batadv(const struct sk_buff *skb,
static void
__skb_flow_dissect_tcp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int thoff, int hlen)
+ void *target_container, const void *data,
+ int thoff, int hlen)
{
struct flow_dissector_key_tcp *key_tcp;
struct tcphdr *th, _th;
@@ -719,8 +721,8 @@ __skb_flow_dissect_tcp(const struct sk_buff *skb,
static void
__skb_flow_dissect_ports(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff,
- u8 ip_proto, int hlen)
+ void *target_container, const void *data,
+ int nhoff, u8 ip_proto, int hlen)
{
enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX;
struct flow_dissector_key_ports *key_ports;
@@ -744,7 +746,8 @@ __skb_flow_dissect_ports(const struct sk_buff *skb,
static void
__skb_flow_dissect_ipv4(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, const struct iphdr *iph)
+ void *target_container, const void *data,
+ const struct iphdr *iph)
{
struct flow_dissector_key_ip *key_ip;
@@ -761,7 +764,8 @@ __skb_flow_dissect_ipv4(const struct sk_buff *skb,
static void
__skb_flow_dissect_ipv6(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, const struct ipv6hdr *iph)
+ void *target_container, const void *data,
+ const struct ipv6hdr *iph)
{
struct flow_dissector_key_ip *key_ip;
@@ -908,9 +912,8 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
bool __skb_flow_dissect(const struct net *net,
const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container,
- void *data, __be16 proto, int nhoff, int hlen,
- unsigned int flags)
+ void *target_container, const void *data,
+ __be16 proto, int nhoff, int hlen, unsigned int flags)
{
struct flow_dissector_key_control *key_control;
struct flow_dissector_key_basic *key_basic;
@@ -1642,7 +1645,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb,
}
EXPORT_SYMBOL(skb_get_hash_perturb);
-u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
const struct flow_keys_basic *keys, int hlen)
{
u32 poff = keys->control.thoff;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index c714e6a9dad4..d8b9dbabd4a4 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -10,9 +10,6 @@
#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
-extern struct list_head ptype_all __read_mostly;
-extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
-
static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
{
struct net *net = seq_file_net(seq);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 307628fdf380..f6197774048b 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1361,83 +1361,94 @@ static const struct attribute_group dql_group = {
#endif /* CONFIG_BQL */
#ifdef CONFIG_XPS
-static ssize_t xps_cpus_show(struct netdev_queue *queue,
- char *buf)
+static ssize_t xps_queue_show(struct net_device *dev, unsigned int index,
+ int tc, char *buf, enum xps_map_type type)
{
- int cpu, len, ret, num_tc = 1, tc = 0;
- struct net_device *dev = queue->dev;
struct xps_dev_maps *dev_maps;
- cpumask_var_t mask;
- unsigned long index;
-
- if (!netif_is_multiqueue(dev))
- return -ENOENT;
+ unsigned long *mask;
+ unsigned int nr_ids;
+ int j, len;
- index = get_netdev_queue_index(queue);
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps[type]);
- if (!rtnl_trylock())
- return restart_syscall();
+ /* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0
+ * when dev_maps hasn't been allocated yet, to be backward compatible.
+ */
+ nr_ids = dev_maps ? dev_maps->nr_ids :
+ (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues);
- if (dev->num_tc) {
- /* Do not allow XPS on subordinate device directly */
- num_tc = dev->num_tc;
- if (num_tc < 0) {
- ret = -EINVAL;
- goto err_rtnl_unlock;
- }
+ mask = bitmap_zalloc(nr_ids, GFP_NOWAIT);
+ if (!mask) {
+ rcu_read_unlock();
+ return -ENOMEM;
+ }
- /* If queue belongs to subordinate dev use its map */
- dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+ if (!dev_maps || tc >= dev_maps->num_tc)
+ goto out_no_maps;
- tc = netdev_txq_to_tc(dev, index);
- if (tc < 0) {
- ret = -EINVAL;
- goto err_rtnl_unlock;
- }
- }
+ for (j = 0; j < nr_ids; j++) {
+ int i, tci = j * dev_maps->num_tc + tc;
+ struct xps_map *map;
- if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
- ret = -ENOMEM;
- goto err_rtnl_unlock;
- }
+ map = rcu_dereference(dev_maps->attr_map[tci]);
+ if (!map)
+ continue;
- rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_cpus_map);
- if (dev_maps) {
- for_each_possible_cpu(cpu) {
- int i, tci = cpu * num_tc + tc;
- struct xps_map *map;
-
- map = rcu_dereference(dev_maps->attr_map[tci]);
- if (!map)
- continue;
-
- for (i = map->len; i--;) {
- if (map->queues[i] == index) {
- cpumask_set_cpu(cpu, mask);
- break;
- }
+ for (i = map->len; i--;) {
+ if (map->queues[i] == index) {
+ set_bit(j, mask);
+ break;
}
}
}
+out_no_maps:
rcu_read_unlock();
- rtnl_unlock();
+ len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids);
+ bitmap_free(mask);
- len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
- free_cpumask_var(mask);
return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf)
+{
+ struct net_device *dev = queue->dev;
+ unsigned int index;
+ int len, tc;
+
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
-err_rtnl_unlock:
+ index = get_netdev_queue_index(queue);
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ /* If queue belongs to subordinate dev use its map */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+ /* Make sure the subordinate device can't be freed */
+ get_device(&dev->dev);
rtnl_unlock();
- return ret;
+
+ len = xps_queue_show(dev, index, tc, buf, XPS_CPUS);
+
+ put_device(&dev->dev);
+ return len;
}
static ssize_t xps_cpus_store(struct netdev_queue *queue,
const char *buf, size_t len)
{
struct net_device *dev = queue->dev;
- unsigned long index;
+ unsigned int index;
cpumask_var_t mask;
int err;
@@ -1476,64 +1487,21 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
{
- int j, len, ret, num_tc = 1, tc = 0;
struct net_device *dev = queue->dev;
- struct xps_dev_maps *dev_maps;
- unsigned long *mask, index;
+ unsigned int index;
+ int tc;
index = get_netdev_queue_index(queue);
if (!rtnl_trylock())
return restart_syscall();
- if (dev->num_tc) {
- num_tc = dev->num_tc;
- tc = netdev_txq_to_tc(dev, index);
- if (tc < 0) {
- ret = -EINVAL;
- goto err_rtnl_unlock;
- }
- }
- mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);
- if (!mask) {
- ret = -ENOMEM;
- goto err_rtnl_unlock;
- }
-
- rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_rxqs_map);
- if (!dev_maps)
- goto out_no_maps;
-
- for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
- j < dev->num_rx_queues;) {
- int i, tci = j * num_tc + tc;
- struct xps_map *map;
-
- map = rcu_dereference(dev_maps->attr_map[tci]);
- if (!map)
- continue;
-
- for (i = map->len; i--;) {
- if (map->queues[i] == index) {
- set_bit(j, mask);
- break;
- }
- }
- }
-out_no_maps:
- rcu_read_unlock();
-
+ tc = netdev_txq_to_tc(dev, index);
rtnl_unlock();
+ if (tc < 0)
+ return -EINVAL;
- len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
- bitmap_free(mask);
-
- return len < PAGE_SIZE ? len : -EINVAL;
-
-err_rtnl_unlock:
- rtnl_unlock();
- return ret;
+ return xps_queue_show(dev, index, tc, buf, XPS_RXQS);
}
static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
@@ -1541,7 +1509,8 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
{
struct net_device *dev = queue->dev;
struct net *net = dev_net(dev);
- unsigned long *mask, index;
+ unsigned long *mask;
+ unsigned int index;
int err;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -1565,7 +1534,7 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
}
cpus_read_lock();
- err = __netif_set_xps_queue(dev, mask, index, true);
+ err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS);
cpus_read_unlock();
rtnl_unlock();
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c421c8f80925..e8320b5d651a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3732,13 +3732,13 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb,
unsigned int tnl_hlen = skb_tnl_header_len(skb);
unsigned int delta_truesize = 0;
unsigned int delta_len = 0;
- struct sk_buff *tail = NULL;
struct sk_buff *nskb, *tmp;
int err;
skb_push(skb, -skb_network_offset(skb) + offset);
skb_shinfo(skb)->frag_list = NULL;
+ skb->next = list_skb;
do {
nskb = list_skb;
@@ -3756,17 +3756,8 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb,
}
}
- if (!tail)
- skb->next = nskb;
- else
- tail->next = nskb;
-
- if (unlikely(err)) {
- nskb->next = list_skb;
+ if (unlikely(err))
goto err_linearize;
- }
-
- tail = nskb;
delta_len += nskb->len;
delta_truesize += nskb->truesize;
@@ -3793,7 +3784,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb,
skb_gso_reset(skb);
- skb->prev = tail;
+ skb->prev = nskb;
if (skb_needs_linearize(skb, features) &&
__skb_linearize(skb))
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 1261512d6807..07f54015238a 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -525,7 +525,8 @@ static void sk_psock_backlog(struct work_struct *work)
len = skb->len;
off = 0;
start:
- ingress = tcp_skb_bpf_ingress(skb);
+ ingress = skb_bpf_ingress(skb);
+ skb_bpf_redirect_clear(skb);
do {
ret = -EIO;
if (likely(psock->sk->sk_socket))
@@ -618,7 +619,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
return link;
}
-void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
{
struct sk_msg *msg, *tmp;
@@ -631,7 +632,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
static void sk_psock_zap_ingress(struct sk_psock *psock)
{
- __skb_queue_purge(&psock->ingress_skb);
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) {
+ skb_bpf_redirect_clear(skb);
+ kfree_skb(skb);
+ }
__sk_psock_purge_ingress_msg(psock);
}
@@ -645,15 +651,15 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
}
}
+static void sk_psock_done_strp(struct sk_psock *psock);
+
static void sk_psock_destroy_deferred(struct work_struct *gc)
{
struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
/* No sk_callback_lock since already detached. */
- /* Parser has been stopped */
- if (psock->progs.skb_parser)
- strp_done(&psock->parser.strp);
+ sk_psock_done_strp(psock);
cancel_work_sync(&psock->work);
@@ -685,9 +691,9 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
write_lock_bh(&sk->sk_callback_lock);
sk_psock_restore_proto(sk, psock);
rcu_assign_sk_user_data(sk, NULL);
- if (psock->progs.skb_parser)
+ if (psock->progs.stream_parser)
sk_psock_stop_strp(sk, psock);
- else if (psock->progs.skb_verdict)
+ else if (psock->progs.stream_verdict)
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
@@ -743,27 +749,12 @@ out:
}
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
-static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
- struct sk_buff *skb)
-{
- bpf_compute_data_end_sk_skb(skb);
- return bpf_prog_run_pin_on_cpu(prog, skb);
-}
-
-static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
-{
- struct sk_psock_parser *parser;
-
- parser = container_of(strp, struct sk_psock_parser, strp);
- return container_of(parser, struct sk_psock, parser);
-}
-
static void sk_psock_skb_redirect(struct sk_buff *skb)
{
struct sk_psock *psock_other;
struct sock *sk_other;
- sk_other = tcp_skb_bpf_redirect_fetch(skb);
+ sk_other = skb_bpf_redirect_fetch(skb);
/* This error is a buggy BPF program, it returned a redirect
* return code, but then didn't set a redirect interface.
*/
@@ -806,16 +797,17 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
int ret = __SK_PASS;
rcu_read_lock();
- prog = READ_ONCE(psock->progs.skb_verdict);
+ prog = READ_ONCE(psock->progs.stream_verdict);
if (likely(prog)) {
/* We skip full set_owner_r here because if we do a SK_PASS
* or SK_DROP we can skip skb memory accounting and use the
* TLS context.
*/
skb->sk = psock->sk;
- tcp_skb_bpf_redirect_clear(skb);
- ret = sk_psock_bpf_run(psock, prog, skb);
- ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
sk_psock_tls_verdict_apply(skb, psock->sk, ret);
@@ -827,7 +819,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
static void sk_psock_verdict_apply(struct sk_psock *psock,
struct sk_buff *skb, int verdict)
{
- struct tcp_skb_cb *tcp;
struct sock *sk_other;
int err = -EIO;
@@ -839,8 +830,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
goto out_free;
}
- tcp = TCP_SKB_CB(skb);
- tcp->bpf.flags |= BPF_F_INGRESS;
+ skb_bpf_set_ingress(skb);
/* If the queue is empty then we can submit directly
* into the msg queue. If its not empty we have to
@@ -866,6 +856,24 @@ out_free:
}
}
+static void sk_psock_write_space(struct sock *sk)
+{
+ struct sk_psock *psock;
+ void (*write_space)(struct sock *sk) = NULL;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock)) {
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ schedule_work(&psock->work);
+ write_space = psock->saved_write_space;
+ }
+ rcu_read_unlock();
+ if (write_space)
+ write_space(sk);
+}
+
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
{
struct sk_psock *psock;
@@ -881,11 +889,12 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
goto out;
}
skb_set_owner_r(skb, sk);
- prog = READ_ONCE(psock->progs.skb_verdict);
+ prog = READ_ONCE(psock->progs.stream_verdict);
if (likely(prog)) {
- tcp_skb_bpf_redirect_clear(skb);
- ret = sk_psock_bpf_run(psock, prog, skb);
- ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
}
sk_psock_verdict_apply(psock, skb, ret);
out:
@@ -899,15 +908,15 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err)
static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
{
- struct sk_psock *psock = sk_psock_from_strp(strp);
+ struct sk_psock *psock = container_of(strp, struct sk_psock, strp);
struct bpf_prog *prog;
int ret = skb->len;
rcu_read_lock();
- prog = READ_ONCE(psock->progs.skb_parser);
+ prog = READ_ONCE(psock->progs.stream_parser);
if (likely(prog)) {
skb->sk = psock->sk;
- ret = sk_psock_bpf_run(psock, prog, skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
skb->sk = NULL;
}
rcu_read_unlock();
@@ -923,16 +932,59 @@ static void sk_psock_strp_data_ready(struct sock *sk)
psock = sk_psock(sk);
if (likely(psock)) {
if (tls_sw_has_ctx_rx(sk)) {
- psock->parser.saved_data_ready(sk);
+ psock->saved_data_ready(sk);
} else {
write_lock_bh(&sk->sk_callback_lock);
- strp_data_ready(&psock->parser.strp);
+ strp_data_ready(&psock->strp);
write_unlock_bh(&sk->sk_callback_lock);
}
}
rcu_read_unlock();
}
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+ static const struct strp_callbacks cb = {
+ .rcv_msg = sk_psock_strp_read,
+ .read_sock_done = sk_psock_strp_read_done,
+ .parse_msg = sk_psock_strp_parse,
+ };
+
+ return strp_init(&psock->strp, sk, &cb);
+}
+
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+ if (psock->saved_data_ready)
+ return;
+
+ psock->saved_data_ready = sk->sk_data_ready;
+ sk->sk_data_ready = sk_psock_strp_data_ready;
+ sk->sk_write_space = sk_psock_write_space;
+}
+
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+ if (!psock->saved_data_ready)
+ return;
+
+ sk->sk_data_ready = psock->saved_data_ready;
+ psock->saved_data_ready = NULL;
+ strp_stop(&psock->strp);
+}
+
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+ /* Parser has been stopped */
+ if (psock->progs.stream_parser)
+ strp_done(&psock->strp);
+}
+#else
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
unsigned int offset, size_t orig_len)
{
@@ -957,11 +1009,12 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
goto out;
}
skb_set_owner_r(skb, sk);
- prog = READ_ONCE(psock->progs.skb_verdict);
+ prog = READ_ONCE(psock->progs.stream_verdict);
if (likely(prog)) {
- tcp_skb_bpf_redirect_clear(skb);
- ret = sk_psock_bpf_run(psock, prog, skb);
- ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
}
sk_psock_verdict_apply(psock, skb, ret);
out:
@@ -984,82 +1037,21 @@ static void sk_psock_verdict_data_ready(struct sock *sk)
sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
}
-static void sk_psock_write_space(struct sock *sk)
-{
- struct sk_psock *psock;
- void (*write_space)(struct sock *sk) = NULL;
-
- rcu_read_lock();
- psock = sk_psock(sk);
- if (likely(psock)) {
- if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
- schedule_work(&psock->work);
- write_space = psock->saved_write_space;
- }
- rcu_read_unlock();
- if (write_space)
- write_space(sk);
-}
-
-int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
-{
- static const struct strp_callbacks cb = {
- .rcv_msg = sk_psock_strp_read,
- .read_sock_done = sk_psock_strp_read_done,
- .parse_msg = sk_psock_strp_parse,
- };
-
- psock->parser.enabled = false;
- return strp_init(&psock->parser.strp, sk, &cb);
-}
-
void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (parser->enabled)
+ if (psock->saved_data_ready)
return;
- parser->saved_data_ready = sk->sk_data_ready;
+ psock->saved_data_ready = sk->sk_data_ready;
sk->sk_data_ready = sk_psock_verdict_data_ready;
sk->sk_write_space = sk_psock_write_space;
- parser->enabled = true;
-}
-
-void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
-{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (parser->enabled)
- return;
-
- parser->saved_data_ready = sk->sk_data_ready;
- sk->sk_data_ready = sk_psock_strp_data_ready;
- sk->sk_write_space = sk_psock_write_space;
- parser->enabled = true;
-}
-
-void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
-{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (!parser->enabled)
- return;
-
- sk->sk_data_ready = parser->saved_data_ready;
- parser->saved_data_ready = NULL;
- strp_stop(&parser->strp);
- parser->enabled = false;
}
void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (!parser->enabled)
+ if (!psock->saved_data_ready)
return;
- sk->sk_data_ready = parser->saved_data_ready;
- parser->saved_data_ready = NULL;
- parser->enabled = false;
+ sk->sk_data_ready = psock->saved_data_ready;
+ psock->saved_data_ready = NULL;
}
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index d758fb83c884..dd53a7771d7e 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -24,6 +24,9 @@ struct bpf_stab {
#define SOCK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, u32 which);
+
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
struct bpf_stab *stab;
@@ -148,9 +151,9 @@ static void sock_map_del_link(struct sock *sk,
struct bpf_map *map = link->map;
struct bpf_stab *stab = container_of(map, struct bpf_stab,
map);
- if (psock->parser.enabled && stab->progs.skb_parser)
+ if (psock->saved_data_ready && stab->progs.stream_parser)
strp_stop = true;
- if (psock->parser.enabled && stab->progs.skb_verdict)
+ if (psock->saved_data_ready && stab->progs.stream_verdict)
verdict_stop = true;
list_del(&link->list);
sk_psock_free_link(link);
@@ -224,23 +227,23 @@ out:
static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
struct sock *sk)
{
- struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
+ struct bpf_prog *msg_parser, *stream_parser, *stream_verdict;
struct sk_psock *psock;
int ret;
- skb_verdict = READ_ONCE(progs->skb_verdict);
- if (skb_verdict) {
- skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
- if (IS_ERR(skb_verdict))
- return PTR_ERR(skb_verdict);
+ stream_verdict = READ_ONCE(progs->stream_verdict);
+ if (stream_verdict) {
+ stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
+ if (IS_ERR(stream_verdict))
+ return PTR_ERR(stream_verdict);
}
- skb_parser = READ_ONCE(progs->skb_parser);
- if (skb_parser) {
- skb_parser = bpf_prog_inc_not_zero(skb_parser);
- if (IS_ERR(skb_parser)) {
- ret = PTR_ERR(skb_parser);
- goto out_put_skb_verdict;
+ stream_parser = READ_ONCE(progs->stream_parser);
+ if (stream_parser) {
+ stream_parser = bpf_prog_inc_not_zero(stream_parser);
+ if (IS_ERR(stream_parser)) {
+ ret = PTR_ERR(stream_parser);
+ goto out_put_stream_verdict;
}
}
@@ -249,7 +252,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
msg_parser = bpf_prog_inc_not_zero(msg_parser);
if (IS_ERR(msg_parser)) {
ret = PTR_ERR(msg_parser);
- goto out_put_skb_parser;
+ goto out_put_stream_parser;
}
}
@@ -261,8 +264,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
if (psock) {
if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
- (skb_parser && READ_ONCE(psock->progs.skb_parser)) ||
- (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) {
+ (stream_parser && READ_ONCE(psock->progs.stream_parser)) ||
+ (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) {
sk_psock_put(sk, psock);
ret = -EBUSY;
goto out_progs;
@@ -283,15 +286,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
goto out_drop;
write_lock_bh(&sk->sk_callback_lock);
- if (skb_parser && skb_verdict && !psock->parser.enabled) {
+ if (stream_parser && stream_verdict && !psock->saved_data_ready) {
ret = sk_psock_init_strp(sk, psock);
if (ret)
goto out_unlock_drop;
- psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
- psock_set_prog(&psock->progs.skb_parser, skb_parser);
+ psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
+ psock_set_prog(&psock->progs.stream_parser, stream_parser);
sk_psock_start_strp(sk, psock);
- } else if (!skb_parser && skb_verdict && !psock->parser.enabled) {
- psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+ } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
+ psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
sk_psock_start_verdict(sk,psock);
}
write_unlock_bh(&sk->sk_callback_lock);
@@ -303,12 +306,12 @@ out_drop:
out_progs:
if (msg_parser)
bpf_prog_put(msg_parser);
-out_put_skb_parser:
- if (skb_parser)
- bpf_prog_put(skb_parser);
-out_put_skb_verdict:
- if (skb_verdict)
- bpf_prog_put(skb_verdict);
+out_put_stream_parser:
+ if (stream_parser)
+ bpf_prog_put(stream_parser);
+out_put_stream_verdict:
+ if (stream_verdict)
+ bpf_prog_put(stream_verdict);
return ret;
}
@@ -657,7 +660,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct bpf_map *, map, u32, key, u64, flags)
{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
struct sock *sk;
if (unlikely(flags & ~(BPF_F_INGRESS)))
@@ -667,8 +669,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = sk;
+ skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
return SK_PASS;
}
@@ -1250,7 +1251,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = {
BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
struct bpf_map *, map, void *, key, u64, flags)
{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
struct sock *sk;
if (unlikely(flags & ~(BPF_F_INGRESS)))
@@ -1260,8 +1260,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = sk;
+ skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
return SK_PASS;
}
@@ -1448,8 +1447,8 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
return NULL;
}
-int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
- struct bpf_prog *old, u32 which)
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, u32 which)
{
struct sk_psock_progs *progs = sock_map_progs(map);
struct bpf_prog **pprog;
@@ -1461,11 +1460,13 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
case BPF_SK_MSG_VERDICT:
pprog = &progs->msg_parser;
break;
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
case BPF_SK_SKB_STREAM_PARSER:
- pprog = &progs->skb_parser;
+ pprog = &progs->stream_parser;
break;
+#endif
case BPF_SK_SKB_STREAM_VERDICT:
- pprog = &progs->skb_verdict;
+ pprog = &progs->stream_verdict;
break;
default:
return -EOPNOTSUPP;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 4567de519603..d84c8a1b280e 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -24,6 +24,7 @@
static int two = 2;
static int three = 3;
+static int int_3600 = 3600;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;
@@ -570,6 +571,15 @@ static struct ctl_table net_core_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
},
+ {
+ .procname = "netdev_unregister_timeout_secs",
+ .data = &netdev_unregister_timeout_secs,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &int_3600,
+ },
{ }
};
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 2193ae529e75..32b1bed8ae51 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -84,8 +84,7 @@
#include <net/dn_neigh.h>
#include <net/dn_fib.h>
-struct dn_rt_hash_bucket
-{
+struct dn_rt_hash_bucket {
struct dn_route __rcu *chain;
spinlock_t lock;
};
@@ -93,7 +92,7 @@ struct dn_rt_hash_bucket
extern struct neigh_table dn_neigh_table;
-static unsigned char dn_hiord_addr[6] = {0xAA,0x00,0x04,0x00,0x00,0x00};
+static unsigned char dn_hiord_addr[6] = {0xAA, 0x00, 0x04, 0x00, 0x00, 0x00};
static const int dn_rt_min_delay = 2 * HZ;
static const int dn_rt_max_delay = 10 * HZ;
@@ -359,10 +358,11 @@ static void dn_run_flush(struct timer_list *unused)
for (i = 0; i < dn_rt_hash_mask; i++) {
spin_lock_bh(&dn_rt_hash_table[i].lock);
- if ((rt = xchg((struct dn_route **)&dn_rt_hash_table[i].chain, NULL)) == NULL)
+ rt = xchg((struct dn_route **)&dn_rt_hash_table[i].chain, NULL);
+ if (!rt)
goto nothing_to_declare;
- for(; rt; rt = next) {
+ for (; rt; rt = next) {
next = rcu_dereference_raw(rt->dn_next);
RCU_INIT_POINTER(rt->dn_next, NULL);
dst_dev_put(&rt->dst);
@@ -425,7 +425,8 @@ static int dn_return_short(struct sk_buff *skb)
/* Add back headers */
skb_push(skb, skb->data - skb_network_header(skb));
- if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL)
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
return NET_RX_DROP;
cb = DN_SKB_CB(skb);
@@ -461,7 +462,8 @@ static int dn_return_long(struct sk_buff *skb)
/* Add back all headers */
skb_push(skb, skb->data - skb_network_header(skb));
- if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL)
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
return NET_RX_DROP;
cb = DN_SKB_CB(skb);
@@ -505,7 +507,8 @@ static int dn_route_rx_packet(struct net *net, struct sock *sk, struct sk_buff *
struct dn_skb_cb *cb;
int err;
- if ((err = dn_route_input(skb)) == 0)
+ err = dn_route_input(skb);
+ if (err == 0)
return dst_input(skb);
cb = DN_SKB_CB(skb);
@@ -629,7 +632,8 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type
if (dn == NULL)
goto dump_it;
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
goto out;
if (!pskb_may_pull(skb, 3))
@@ -898,7 +902,7 @@ static inline int dn_match_addr(__le16 addr1, __le16 addr2)
{
__u16 tmp = le16_to_cpu(addr1) ^ le16_to_cpu(addr2);
int match = 16;
- while(tmp) {
+ while (tmp) {
tmp >>= 1;
match--;
}
@@ -1324,7 +1328,8 @@ static int dn_route_input_slow(struct sk_buff *skb)
dev_hold(in_dev);
- if ((dn_db = rcu_dereference(in_dev->dn_ptr)) == NULL)
+ dn_db = rcu_dereference(in_dev->dn_ptr);
+ if (!dn_db)
goto out;
/* Zero source addresses are not allowed */
@@ -1383,7 +1388,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
fld.saddr = src_map;
}
- switch(res.type) {
+ switch (res.type) {
case RTN_UNICAST:
/*
* Forwarding check here, we only check for forwarding
@@ -1407,7 +1412,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
flags |= RTCF_DOREDIRECT;
local_src = DN_FIB_RES_PREFSRC(res);
-
+ break;
case RTN_BLACKHOLE:
case RTN_UNREACHABLE:
break;
@@ -1526,7 +1531,7 @@ static int dn_route_input(struct sk_buff *skb)
return 0;
rcu_read_lock();
- for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL;
+ for (rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL;
rt = rcu_dereference(rt->dn_next)) {
if ((rt->fld.saddr == cb->src) &&
(rt->fld.daddr == cb->dst) &&
@@ -1739,13 +1744,13 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
s_h = cb->args[0];
s_idx = idx = cb->args[1];
- for(h = 0; h <= dn_rt_hash_mask; h++) {
+ for (h = 0; h <= dn_rt_hash_mask; h++) {
if (h < s_h)
continue;
if (h > s_h)
s_idx = 0;
rcu_read_lock_bh();
- for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0;
+ for (rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0;
rt;
rt = rcu_dereference_bh(rt->dn_next), idx++) {
if (idx < s_idx)
@@ -1779,7 +1784,7 @@ static struct dn_route *dn_rt_cache_get_first(struct seq_file *seq)
struct dn_route *rt = NULL;
struct dn_rt_cache_iter_state *s = seq->private;
- for(s->bucket = dn_rt_hash_mask; s->bucket >= 0; --s->bucket) {
+ for (s->bucket = dn_rt_hash_mask; s->bucket >= 0; --s->bucket) {
rcu_read_lock_bh();
rt = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain);
if (rt)
@@ -1809,7 +1814,7 @@ static void *dn_rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
struct dn_route *rt = dn_rt_cache_get_first(seq);
if (rt) {
- while(*pos && (rt = dn_rt_cache_get_next(seq, rt)))
+ while (*pos && (rt = dn_rt_cache_get_next(seq, rt)))
--*pos;
}
return *pos ? NULL : rt;
@@ -1864,21 +1869,21 @@ void __init dn_route_init(void)
goal = totalram_pages() >> (26 - PAGE_SHIFT);
- for(order = 0; (1UL << order) < goal; order++)
+ for (order = 0; (1UL << order) < goal; order++)
/* NOTHING */;
/*
* Only want 1024 entries max, since the table is very, very unlikely
* to be larger than that.
*/
- while(order && ((((1UL << order) * PAGE_SIZE) /
+ while (order && ((((1UL << order) * PAGE_SIZE) /
sizeof(struct dn_rt_hash_bucket)) >= 2048))
order--;
do {
dn_rt_hash_mask = (1UL << order) * PAGE_SIZE /
sizeof(struct dn_rt_hash_bucket);
- while(dn_rt_hash_mask & (dn_rt_hash_mask - 1))
+ while (dn_rt_hash_mask & (dn_rt_hash_mask - 1))
dn_rt_hash_mask--;
dn_rt_hash_table = (struct dn_rt_hash_bucket *)
__get_free_pages(GFP_ATOMIC, order);
@@ -1893,7 +1898,7 @@ void __init dn_route_init(void)
(long)(dn_rt_hash_mask*sizeof(struct dn_rt_hash_bucket))/1024);
dn_rt_hash_mask--;
- for(i = 0; i <= dn_rt_hash_mask; i++) {
+ for (i = 0; i <= dn_rt_hash_mask; i++) {
spin_lock_init(&dn_rt_hash_table[i].lock);
dn_rt_hash_table[i].chain = NULL;
}
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 58b8fc82cd3c..8746b07668ae 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -1,15 +1,10 @@
# SPDX-License-Identifier: GPL-2.0-only
-config HAVE_NET_DSA
- def_bool y
- depends on INET && NETDEVICES && !S390
-
-# Drivers must select NET_DSA and the appropriate tagging format
menuconfig NET_DSA
tristate "Distributed Switch Architecture"
- depends on HAVE_NET_DSA
depends on BRIDGE || BRIDGE=n
depends on HSR || HSR=n
+ depends on INET && NETDEVICES
select GRO_CELLS
select NET_SWITCHDEV
select PHYLINK
@@ -20,7 +15,8 @@ menuconfig NET_DSA
if NET_DSA
-# tagging formats
+# Drivers must select the appropriate tagging format(s)
+
config NET_DSA_TAG_8021Q
tristate
select VLAN_8021Q
@@ -48,6 +44,13 @@ config NET_DSA_TAG_BRCM
Say Y if you want to enable support for tagging frames for the
Broadcom switches which place the tag after the MAC source address.
+config NET_DSA_TAG_BRCM_LEGACY
+ tristate "Tag driver for Broadcom legacy switches using in-frame headers"
+ select NET_DSA_TAG_BRCM_COMMON
+ help
+ Say Y if you want to enable support for tagging frames for the
+ Broadcom legacy switches which place the tag after the MAC source
+ address.
config NET_DSA_TAG_BRCM_PREPEND
tristate "Tag driver for Broadcom switches using prepended headers"
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 9d4b0e9b1aa1..92282de54230 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -181,12 +181,14 @@ int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy);
int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy);
void dsa_port_disable_rt(struct dsa_port *dp);
void dsa_port_disable(struct dsa_port *dp);
-int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br);
+int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
+ struct netlink_ext_ack *extack);
void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br);
int dsa_port_lag_change(struct dsa_port *dp,
struct netdev_lag_lower_state_info *linfo);
int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev,
- struct netdev_lag_upper_info *uinfo);
+ struct netdev_lag_upper_info *uinfo,
+ struct netlink_ext_ack *extack);
void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev);
int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
struct netlink_ext_ack *extack);
@@ -233,19 +235,7 @@ extern const struct phylink_mac_ops dsa_port_phylink_mac_ops;
static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp,
struct net_device *dev)
{
- /* Switchdev offloading can be configured on: */
-
- if (dev == dp->slave)
- /* DSA ports directly connected to a bridge, and event
- * was emitted for the ports themselves.
- */
- return true;
-
- if (dp->lag_dev == dev)
- /* DSA ports connected to a bridge via a LAG */
- return true;
-
- return false;
+ return dsa_port_to_bridge_port(dp) == dev;
}
static inline bool dsa_port_offloads_bridge(struct dsa_port *dp,
@@ -272,6 +262,9 @@ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst,
/* slave.c */
extern const struct dsa_device_ops notag_netdev_ops;
+extern struct notifier_block dsa_slave_switchdev_notifier;
+extern struct notifier_block dsa_slave_switchdev_blocking_notifier;
+
void dsa_slave_mii_bus_init(struct dsa_switch *ds);
int dsa_slave_create(struct dsa_port *dp);
void dsa_slave_destroy(struct net_device *slave_dev);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index c9c6d7ab3f47..01e30264b25b 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -122,29 +122,132 @@ void dsa_port_disable(struct dsa_port *dp)
rtnl_unlock();
}
-static void dsa_port_change_brport_flags(struct dsa_port *dp,
- bool bridge_offload)
+static int dsa_port_inherit_brport_flags(struct dsa_port *dp,
+ struct netlink_ext_ack *extack)
{
- struct switchdev_brport_flags flags;
- int flag;
+ const unsigned long mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD |
+ BR_BCAST_FLOOD;
+ struct net_device *brport_dev = dsa_port_to_bridge_port(dp);
+ int flag, err;
- flags.mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
- if (bridge_offload)
- flags.val = flags.mask;
- else
- flags.val = flags.mask & ~BR_LEARNING;
+ for_each_set_bit(flag, &mask, 32) {
+ struct switchdev_brport_flags flags = {0};
+
+ flags.mask = BIT(flag);
+
+ if (br_port_flag_is_set(brport_dev, BIT(flag)))
+ flags.val = BIT(flag);
- for_each_set_bit(flag, &flags.mask, 32) {
- struct switchdev_brport_flags tmp;
+ err = dsa_port_bridge_flags(dp, flags, extack);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ }
- tmp.val = flags.val & BIT(flag);
- tmp.mask = BIT(flag);
+ return 0;
+}
- dsa_port_bridge_flags(dp, tmp, NULL);
+/* Put the brport flags of @dp back to their standalone values: address
+ * learning off, unicast/multicast/broadcast flooding on. Every flag is pushed
+ * to the driver on its own; -EOPNOTSUPP is ignored and any other error is
+ * only logged, so one failing flag does not stop the remaining ones.
+ */
+static void dsa_port_clear_brport_flags(struct dsa_port *dp)
+{
+	const unsigned long val = BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
+	const unsigned long mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD |
+				   BR_BCAST_FLOOD;
+	int flag, err;
+
+	for_each_set_bit(flag, &mask, 32) {
+		struct switchdev_brport_flags flags = {0};
+
+		flags.mask = BIT(flag);
+		flags.val = val & BIT(flag);
+
+		err = dsa_port_bridge_flags(dp, flags, NULL);
+		/* Report the flag bit itself (mask): val is 0 for BR_LEARNING
+		 * and would not identify which flag failed.
+		 */
+		if (err && err != -EOPNOTSUPP)
+			dev_err(dp->ds->dev,
+				"failed to clear bridge port flag %lu: %pe\n",
+				flags.mask, ERR_PTR(err));
 	}
 }
-int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br)
+/* -EOPNOTSUPP from a sync step is tolerated; anything else non-zero is not */
+static bool dsa_port_sync_failed(int rc)
+{
+	return rc && rc != -EOPNOTSUPP;
+}
+
+/* Bring @dp in line with the bridge it is attached to: brport flags, STP
+ * state, VLAN filtering, multicast router setting (applied to the CPU port)
+ * and ageing time, followed by a replay of the bridge's MDB, FDB and VLAN
+ * entries through the DSA switchdev notifiers.
+ *
+ * Returns 0 on success, or the first error other than -EOPNOTSUPP.
+ */
+static int dsa_port_switchdev_sync(struct dsa_port *dp,
+				   struct netlink_ext_ack *extack)
+{
+	struct net_device *brport = dsa_port_to_bridge_port(dp);
+	struct net_device *bridge = dp->bridge_dev;
+	int rc;
+
+	/* Filters -EOPNOTSUPP per flag itself, so any error here is fatal */
+	rc = dsa_port_inherit_brport_flags(dp, extack);
+	if (rc)
+		return rc;
+
+	rc = dsa_port_set_state(dp, br_port_get_stp_state(brport));
+	if (dsa_port_sync_failed(rc))
+		return rc;
+
+	rc = dsa_port_vlan_filtering(dp, br_vlan_enabled(bridge), extack);
+	if (dsa_port_sync_failed(rc))
+		return rc;
+
+	rc = dsa_port_mrouter(dp->cpu_dp, br_multicast_router(bridge), extack);
+	if (dsa_port_sync_failed(rc))
+		return rc;
+
+	rc = dsa_port_ageing_time(dp, br_get_ageing_time(bridge));
+	if (dsa_port_sync_failed(rc))
+		return rc;
+
+	rc = br_mdb_replay(bridge, brport,
+			   &dsa_slave_switchdev_blocking_notifier, extack);
+	if (dsa_port_sync_failed(rc))
+		return rc;
+
+	rc = br_fdb_replay(bridge, brport, &dsa_slave_switchdev_notifier);
+	if (dsa_port_sync_failed(rc))
+		return rc;
+
+	rc = br_vlan_replay(bridge, brport,
+			    &dsa_slave_switchdev_blocking_notifier, extack);
+	if (dsa_port_sync_failed(rc))
+		return rc;
+
+	return 0;
+}
+
+/* Return @dp to standalone settings: reset its brport flags, force the STP
+ * state to forwarding and enable the multicast-router setting on the CPU
+ * port. Nothing is returned; errors from the individual steps are not
+ * propagated to the caller.
+ */
+static void dsa_port_switchdev_unsync(struct dsa_port *dp)
+{
+ /* Configure the port for standalone mode (no address learning,
+ * flood everything).
+ * The bridge only emits SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS events
+ * when the user requests it through netlink or sysfs, but not
+ * automatically at port join or leave, so we need to handle resetting
+ * the brport flags ourselves. But we even prefer it that way, because
+ * otherwise, some setups might never get the notification they need,
+ * for example, when a port leaves a LAG that offloads the bridge,
+ * it becomes standalone, but as far as the bridge is concerned, no
+ * port ever left.
+ */
+ dsa_port_clear_brport_flags(dp);
+
+ /* The bridge layer put the port in BR_STATE_DISABLED when it left;
+ * force BR_STATE_FORWARDING so the standalone port stays functional.
+ */
+ dsa_port_set_state_now(dp, BR_STATE_FORWARDING);
+
+ /* VLAN filtering is handled by dsa_switch_bridge_leave */
+
+ /* Some drivers treat the notification for having a local multicast
+ * router by allowing multicast to be flooded to the CPU, so we should
+ * allow this in standalone mode too.
+ */
+ dsa_port_mrouter(dp->cpu_dp, true, NULL);
+
+ /* Ageing time may be global to the switch chip, so don't change it
+ * here because we have no good reason (or value) to change it to.
+ */
+}
+
+int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
+ struct netlink_ext_ack *extack)
{
struct dsa_notifier_bridge_info info = {
.tree_index = dp->ds->dst->index,
@@ -154,24 +257,25 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br)
};
int err;
- /* Notify the port driver to set its configurable flags in a way that
- * matches the initial settings of a bridge port.
- */
- dsa_port_change_brport_flags(dp, true);
-
/* Here the interface is already bridged. Reflect the current
* configuration so that drivers can program their chips accordingly.
*/
dp->bridge_dev = br;
err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_JOIN, &info);
+ if (err)
+ goto out_rollback;
- /* The bridging is rolled back on error */
- if (err) {
- dsa_port_change_brport_flags(dp, false);
- dp->bridge_dev = NULL;
- }
+ err = dsa_port_switchdev_sync(dp, extack);
+ if (err)
+ goto out_rollback_unbridge;
+ return 0;
+
+out_rollback_unbridge:
+ dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info);
+out_rollback:
+ dp->bridge_dev = NULL;
return err;
}
@@ -194,23 +298,7 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br)
if (err)
pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n");
- /* Configure the port for standalone mode (no address learning,
- * flood everything).
- * The bridge only emits SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS events
- * when the user requests it through netlink or sysfs, but not
- * automatically at port join or leave, so we need to handle resetting
- * the brport flags ourselves. But we even prefer it that way, because
- * otherwise, some setups might never get the notification they need,
- * for example, when a port leaves a LAG that offloads the bridge,
- * it becomes standalone, but as far as the bridge is concerned, no
- * port ever left.
- */
- dsa_port_change_brport_flags(dp, false);
-
- /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
- * so allow it to be in BR_STATE_FORWARDING to be kept functional
- */
- dsa_port_set_state_now(dp, BR_STATE_FORWARDING);
+ dsa_port_switchdev_unsync(dp);
}
int dsa_port_lag_change(struct dsa_port *dp,
@@ -241,7 +329,8 @@ int dsa_port_lag_change(struct dsa_port *dp,
}
int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag,
- struct netdev_lag_upper_info *uinfo)
+ struct netdev_lag_upper_info *uinfo,
+ struct netlink_ext_ack *extack)
{
struct dsa_notifier_lag_info info = {
.sw_index = dp->ds->index,
@@ -249,17 +338,31 @@ int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag,
.lag = lag,
.info = uinfo,
};
+ struct net_device *bridge_dev;
int err;
dsa_lag_map(dp->ds->dst, lag);
dp->lag_dev = lag;
err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_JOIN, &info);
- if (err) {
- dp->lag_dev = NULL;
- dsa_lag_unmap(dp->ds->dst, lag);
- }
+ if (err)
+ goto err_lag_join;
+ bridge_dev = netdev_master_upper_dev_get(lag);
+ if (!bridge_dev || !netif_is_bridge_master(bridge_dev))
+ return 0;
+
+ err = dsa_port_bridge_join(dp, bridge_dev, extack);
+ if (err)
+ goto err_bridge_join;
+
+ return 0;
+
+err_bridge_join:
+ dsa_port_notify(dp, DSA_NOTIFIER_LAG_LEAVE, &info);
+err_lag_join:
+ dp->lag_dev = NULL;
+ dsa_lag_unmap(dp->ds->dst, lag);
return err;
}
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 992fcab4b552..995e0e16f295 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1278,14 +1278,32 @@ static int dsa_slave_setup_tc_block(struct net_device *dev,
}
}
+/* Forward a TC_SETUP_FT request for switch port @port to the DSA master of
+ * that port's CPU port. Returns whatever the master's ndo_setup_tc returns,
+ * or -EOPNOTSUPP when the master does not implement ndo_setup_tc.
+ */
+static int dsa_slave_setup_ft_block(struct dsa_switch *ds, int port,
+				    void *type_data)
+{
+	struct net_device *master = dsa_to_port(ds, port)->cpu_dp->master;
+	const struct net_device_ops *ops = master->netdev_ops;
+
+	if (ops->ndo_setup_tc)
+		return ops->ndo_setup_tc(master, TC_SETUP_FT, type_data);
+
+	return -EOPNOTSUPP;
+}
+
static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type,
void *type_data)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
struct dsa_switch *ds = dp->ds;
- if (type == TC_SETUP_BLOCK)
+ switch (type) {
+ case TC_SETUP_BLOCK:
return dsa_slave_setup_tc_block(dev, type_data);
+ case TC_SETUP_FT:
+ return dsa_slave_setup_ft_block(ds, dp->index, type_data);
+ default:
+ break;
+ }
if (!ds->ops->port_setup_tc)
return -EOPNOTSUPP;
@@ -1654,6 +1672,21 @@ static void dsa_slave_get_stats64(struct net_device *dev,
dev_get_tstats64(dev, s);
}
+/* Describe the DSA slave in @ctx->dev as a DEV_PATH_DSA entry in @path
+ * (tagging protocol of the CPU port and index of the switch port), then
+ * replace @ctx->dev with the CPU port's master device. Always returns 0.
+ */
+static int dsa_slave_fill_forward_path(struct net_device_path_ctx *ctx,
+				       struct net_device_path *path)
+{
+	struct dsa_port *port = dsa_slave_to_port(ctx->dev);
+	struct dsa_port *cpu_port = port->cpu_dp;
+
+	path->type = DEV_PATH_DSA;
+	path->dev = ctx->dev;
+	path->dsa.port = port->index;
+	path->dsa.proto = cpu_port->tag_ops->proto;
+
+	/* Done last: path->dev above must record the slave, not the master */
+	ctx->dev = cpu_port->master;
+
+	return 0;
+}
+
static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_open = dsa_slave_open,
.ndo_stop = dsa_slave_close,
@@ -1679,6 +1712,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid,
.ndo_get_devlink_port = dsa_slave_get_devlink_port,
.ndo_change_mtu = dsa_slave_change_mtu,
+ .ndo_fill_forward_path = dsa_slave_fill_forward_path,
};
static struct device_type dsa_type = {
@@ -1976,11 +2010,14 @@ static int dsa_slave_changeupper(struct net_device *dev,
struct netdev_notifier_changeupper_info *info)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct netlink_ext_ack *extack;
int err = NOTIFY_DONE;
+ extack = netdev_notifier_info_to_extack(&info->info);
+
if (netif_is_bridge_master(info->upper_dev)) {
if (info->linking) {
- err = dsa_port_bridge_join(dp, info->upper_dev);
+ err = dsa_port_bridge_join(dp, info->upper_dev, extack);
if (!err)
dsa_bridge_mtu_normalization(dp);
err = notifier_from_errno(err);
@@ -1991,7 +2028,7 @@ static int dsa_slave_changeupper(struct net_device *dev,
} else if (netif_is_lag_master(info->upper_dev)) {
if (info->linking) {
err = dsa_port_lag_join(dp, info->upper_dev,
- info->upper_info);
+ info->upper_info, extack);
if (err == -EOPNOTSUPP) {
NL_SET_ERR_MSG_MOD(info->info.extack,
"Offloading not supported");
@@ -2389,11 +2426,11 @@ static struct notifier_block dsa_slave_nb __read_mostly = {
.notifier_call = dsa_slave_netdevice_event,
};
-static struct notifier_block dsa_slave_switchdev_notifier = {
+struct notifier_block dsa_slave_switchdev_notifier = {
.notifier_call = dsa_slave_switchdev_event,
};
-static struct notifier_block dsa_slave_switchdev_blocking_notifier = {
+struct notifier_block dsa_slave_switchdev_blocking_notifier = {
.notifier_call = dsa_slave_switchdev_blocking_event,
};
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index e2577a7dcbca..40e9f3098c8d 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -12,9 +12,26 @@
#include "dsa_priv.h"
-/* This tag length is 4 bytes, older ones were 6 bytes, we do not
- * handle them
- */
+/* Legacy Broadcom tag (6 bytes) */
+#define BRCM_LEG_TAG_LEN 6
+
+/* Type fields */
+/* 1st byte in the tag */
+#define BRCM_LEG_TYPE_HI 0x88
+/* 2nd byte in the tag */
+#define BRCM_LEG_TYPE_LO 0x74
+
+/* Tag fields */
+/* 3rd byte in the tag */
+#define BRCM_LEG_UNICAST (0 << 5)
+#define BRCM_LEG_MULTICAST (1 << 5)
+#define BRCM_LEG_EGRESS (2 << 5)
+#define BRCM_LEG_INGRESS (3 << 5)
+
+/* 6th byte in the tag */
+#define BRCM_LEG_PORT_ID (0xf)
+
+/* Newer Broadcom tag (4 bytes) */
#define BRCM_TAG_LEN 4
/* Tag is constructed and desconstructed using byte by byte access
@@ -195,6 +212,87 @@ DSA_TAG_DRIVER(brcm_netdev_ops);
MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM);
#endif
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY)
+/* Insert the 6-byte legacy Broadcom tag right after the two MAC addresses of
+ * an outgoing frame, with the slave's port index in the last tag byte.
+ * Returns the tagged skb, or NULL when padding the frame failed.
+ */
+static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb,
+					 struct net_device *dev)
+{
+	struct dsa_port *port = dsa_slave_to_port(dev);
+	u8 *tag;
+
+	/* The switch discards frames shorter than 64 bytes (FCS included)
+	 * when they enter its port logic, and it checks the length only after
+	 * stripping the Broadcom tag. Tagged frames therefore have to be at
+	 * least 70 bytes (FCS and tag included), so pad up to that size.
+	 *
+	 * On failure the skb is left for dsa_slave_xmit() to free.
+	 */
+	if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false))
+		return NULL;
+
+	/* Open a gap at the front and slide DA + SA down into it */
+	skb_push(skb, BRCM_LEG_TAG_LEN);
+	memmove(skb->data, skb->data + BRCM_LEG_TAG_LEN, 2 * ETH_ALEN);
+
+	tag = skb->data + 2 * ETH_ALEN;
+
+	/* Bytes 0-1: Broadcom tag type */
+	tag[0] = BRCM_LEG_TYPE_HI;
+	tag[1] = BRCM_LEG_TYPE_LO;
+
+	/* Bytes 2-5: Broadcom tag value, destination port in the last byte */
+	tag[2] = BRCM_LEG_EGRESS;
+	tag[3] = 0;
+	tag[4] = 0;
+	tag[5] = port->index & BRCM_LEG_PORT_ID;
+
+	return skb;
+}
+
+/* Receive path for the 6-byte legacy Broadcom tag: read the source port from
+ * the last tag byte, point skb->dev at the matching slave device, strip the
+ * tag and move the two MAC addresses up so they sit right before the
+ * remaining frame data. Returns the skb, or NULL when the tag cannot be
+ * pulled or dsa_master_find_slave() finds no device for the port.
+ */
+static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
+					struct net_device *dev,
+					struct packet_type *pt)
+{
+	int source_port;
+	u8 *brcm_tag;
+
+	/* BRCM_LEG_PORT_ID is the port bit mask, not a length. What has to be
+	 * pullable is the tag itself, which is also the amount stripped below.
+	 */
+	if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN)))
+		return NULL;
+
+	/* The tag starts 2 bytes before skb->data */
+	brcm_tag = skb->data - 2;
+
+	source_port = brcm_tag[5] & BRCM_LEG_PORT_ID;
+
+	skb->dev = dsa_master_find_slave(dev, 0, source_port);
+	if (!skb->dev)
+		return NULL;
+
+	/* Remove Broadcom tag and update checksum */
+	skb_pull_rcsum(skb, BRCM_LEG_TAG_LEN);
+
+	skb->offload_fwd_mark = 1;
+
+	/* Move the Ethernet DA and SA */
+	memmove(skb->data - ETH_HLEN,
+		skb->data - ETH_HLEN - BRCM_LEG_TAG_LEN,
+		2 * ETH_ALEN);
+
+	return skb;
+}
+
+static const struct dsa_device_ops brcm_legacy_netdev_ops = {
+ .name = "brcm-legacy",
+ .proto = DSA_TAG_PROTO_BRCM_LEGACY,
+ .xmit = brcm_leg_tag_xmit,
+ .rcv = brcm_leg_tag_rcv,
+ .overhead = BRCM_LEG_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(brcm_legacy_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY);
+#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY */
+
#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND)
static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb,
struct net_device *dev)
@@ -227,6 +325,9 @@ static struct dsa_tag_driver *dsa_tag_driver_array[] = {
#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM)
&DSA_TAG_DRIVER_NAME(brcm_netdev_ops),
#endif
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY)
+ &DSA_TAG_DRIVER_NAME(brcm_legacy_netdev_ops),
+#endif
#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND)
&DSA_TAG_DRIVER_NAME(brcm_prepend_netdev_ops),
#endif
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index 59748487664f..f9b2966d1936 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -24,9 +24,6 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
struct dsa_port *dp = dsa_slave_to_port(dev);
u8 xmit_tpid;
u8 *mtk_tag;
- unsigned char *dest = eth_hdr(skb)->h_dest;
- bool is_multicast_skb = is_multicast_ether_addr(dest) &&
- !is_broadcast_ether_addr(dest);
/* Build the special tag after the MAC Source Address. If VLAN header
* is present, it's required that VLAN header and special tag is
@@ -55,10 +52,6 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
mtk_tag[0] = xmit_tpid;
mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK;
- /* Disable SA learning for multicast frames */
- if (unlikely(is_multicast_skb))
- mtk_tag[1] |= MTK_HDR_XMIT_SA_DIS;
-
/* Tag control information is kept for 802.1Q */
if (xmit_tpid == MTK_HDR_XMIT_UNTAGGED) {
mtk_tag[2] = 0;
@@ -74,9 +67,6 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev,
u16 hdr;
int port;
__be16 *phdr;
- unsigned char *dest = eth_hdr(skb)->h_dest;
- bool is_multicast_skb = is_multicast_ether_addr(dest) &&
- !is_broadcast_ether_addr(dest);
if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN)))
return NULL;
@@ -102,9 +92,7 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev,
if (!skb->dev)
return NULL;
- /* Only unicast or broadcast frames are offloaded */
- if (likely(!is_multicast_skb))
- skb->offload_fwd_mark = 1;
+ skb->offload_fwd_mark = 1;
return skb;
}
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index 743809b5806b..f9df9cac81c5 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -83,7 +83,6 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
struct dsa_port *dp;
u8 *extraction;
u16 vlan_tpid;
- u64 cpuq;
/* Revert skb->data by the amount consumed by the DSA master,
* so it points to the beginning of the frame.
@@ -113,7 +112,6 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
ocelot_xfh_get_qos_class(extraction, &qos_class);
ocelot_xfh_get_tag_type(extraction, &tag_type);
ocelot_xfh_get_vlan_tci(extraction, &vlan_tci);
- ocelot_xfh_get_cpuq(extraction, &cpuq);
skb->dev = dsa_master_find_slave(netdev, 0, src_port);
if (!skb->dev)
@@ -128,12 +126,6 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
skb->offload_fwd_mark = 1;
skb->priority = qos_class;
-#if IS_ENABLED(CONFIG_BRIDGE_MRP)
- if (eth_hdr(skb)->h_proto == cpu_to_be16(ETH_P_MRP) &&
- cpuq & BIT(OCELOT_MRP_CPUQ))
- skb->offload_fwd_mark = 0;
-#endif
-
/* Ocelot switches copy frames unmodified to the CPU. However, it is
* possible for the user to request a VLAN modification through
* VCAP_IS1_ACT_VID_REPLACE_ENA. In this case, what will happen is that
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 4106373180c6..933b427122be 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -58,6 +58,7 @@
#include <net/ip.h>
#include <net/dsa.h>
#include <net/flow_dissector.h>
+#include <net/gro.h>
#include <linux/uaccess.h>
#include <net/pkt_sched.h>
@@ -122,7 +123,7 @@ EXPORT_SYMBOL(eth_header);
* Make a best effort attempt to pull the length for all of the headers for
* a given frame in a linear buffer.
*/
-u32 eth_get_headlen(const struct net_device *dev, void *data, unsigned int len)
+u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len)
{
const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
const struct ethhdr *eth = (const struct ethhdr *)data;
@@ -449,7 +450,10 @@ struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb)
skb_gro_pull(skb, sizeof(*eh));
skb_gro_postpull_rcsum(skb, eh, sizeof(*eh));
- pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
+
+ pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive,
+ ipv6_gro_receive, inet_gro_receive,
+ head, skb);
out_unlock:
rcu_read_unlock();
@@ -473,8 +477,9 @@ int eth_gro_complete(struct sk_buff *skb, int nhoff)
rcu_read_lock();
ptype = gro_find_complete_by_type(type);
if (ptype != NULL)
- err = ptype->callbacks.gro_complete(skb, nhoff +
- sizeof(struct ethhdr));
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
+ ipv6_gro_complete, inet_gro_complete,
+ skb, nhoff + sizeof(*eh));
rcu_read_unlock();
return err;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 24783b71c584..0788cc3b3114 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1844,6 +1844,18 @@ out:
return ret;
}
+__printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ vsnprintf(*data, ETH_GSTRING_LEN, fmt, args);
+ va_end(args);
+
+ *data += ETH_GSTRING_LEN;
+}
+EXPORT_SYMBOL(ethtool_sprintf);
+
static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
{
struct ethtool_value id;
diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c
index 4cfd9e829c7b..99f3af1a9d4d 100644
--- a/net/hsr/hsr_debugfs.c
+++ b/net/hsr/hsr_debugfs.c
@@ -108,7 +108,7 @@ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev)
/* hsr_debugfs_term - Tear down debugfs intrastructure
*
* Description:
- * When Debufs is configured this routine removes debugfs file system
+ * When Debugfs is configured this routine removes debugfs file system
* elements that are specific to hsr
*/
void
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 5b77a46885b9..bbdd9c44f14e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -62,7 +62,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
-obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o
+obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 743777bce179..f09fe3a5608f 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -16,6 +16,9 @@
#include <net/route.h>
#include <net/sock.h>
+#define NH_RES_DEFAULT_IDLE_TIMER (120 * HZ)
+#define NH_RES_DEFAULT_UNBALANCED_TIMER 0 /* No forced rebalancing. */
+
static void remove_nexthop(struct net *net, struct nexthop *nh,
struct nl_info *nlinfo);
@@ -32,6 +35,7 @@ static const struct nla_policy rtm_nh_policy_new[] = {
[NHA_ENCAP_TYPE] = { .type = NLA_U16 },
[NHA_ENCAP] = { .type = NLA_NESTED },
[NHA_FDB] = { .type = NLA_FLAG },
+ [NHA_RES_GROUP] = { .type = NLA_NESTED },
};
static const struct nla_policy rtm_nh_policy_get[] = {
@@ -45,6 +49,32 @@ static const struct nla_policy rtm_nh_policy_dump[] = {
[NHA_FDB] = { .type = NLA_FLAG },
};
+static const struct nla_policy rtm_nh_res_policy_new[] = {
+ [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 },
+ [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 },
+ [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy rtm_nh_policy_dump_bucket[] = {
+ [NHA_ID] = { .type = NLA_U32 },
+ [NHA_OIF] = { .type = NLA_U32 },
+ [NHA_MASTER] = { .type = NLA_U32 },
+ [NHA_RES_BUCKET] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = {
+ [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy rtm_nh_policy_get_bucket[] = {
+ [NHA_ID] = { .type = NLA_U32 },
+ [NHA_RES_BUCKET] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy rtm_nh_res_bucket_policy_get[] = {
+ [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 },
+};
+
static bool nexthop_notifiers_is_empty(struct net *net)
{
return !net->nexthop.notifier_chain.head;
@@ -52,10 +82,8 @@ static bool nexthop_notifiers_is_empty(struct net *net)
static void
__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
- const struct nexthop *nh)
+ const struct nh_info *nhi)
{
- struct nh_info *nhi = rtnl_dereference(nh->nh_info);
-
nh_info->dev = nhi->fib_nhc.nhc_dev;
nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;
if (nh_info->gw_family == AF_INET)
@@ -71,12 +99,14 @@ __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
static int nh_notifier_single_info_init(struct nh_notifier_info *info,
const struct nexthop *nh)
{
+ struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;
info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL);
if (!info->nh)
return -ENOMEM;
- __nh_notifier_single_info_init(info->nh, nh);
+ __nh_notifier_single_info_init(info->nh, nhi);
return 0;
}
@@ -103,11 +133,44 @@ static int nh_notifier_mp_info_init(struct nh_notifier_info *info,
for (i = 0; i < num_nh; i++) {
struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ struct nh_info *nhi;
+ nhi = rtnl_dereference(nhge->nh->nh_info);
info->nh_grp->nh_entries[i].id = nhge->nh->id;
info->nh_grp->nh_entries[i].weight = nhge->weight;
__nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
- nhge->nh);
+ nhi);
+ }
+
+ return 0;
+}
+
+static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
+ struct nh_group *nhg)
+{
+ struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
+ u16 num_nh_buckets = res_table->num_nh_buckets;
+ unsigned long size;
+ u16 i;
+
+ info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE;
+ size = struct_size(info->nh_res_table, nhs, num_nh_buckets);
+ info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO |
+ __GFP_NOWARN);
+ if (!info->nh_res_table)
+ return -ENOMEM;
+
+ info->nh_res_table->num_nh_buckets = num_nh_buckets;
+
+ for (i = 0; i < num_nh_buckets; i++) {
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ struct nh_grp_entry *nhge;
+ struct nh_info *nhi;
+
+ nhge = rtnl_dereference(bucket->nh_entry);
+ nhi = rtnl_dereference(nhge->nh->nh_info);
+ __nh_notifier_single_info_init(&info->nh_res_table->nhs[i],
+ nhi);
}
return 0;
@@ -120,6 +183,8 @@ static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
if (nhg->mpath)
return nh_notifier_mp_info_init(info, nhg);
+ else if (nhg->resilient)
+ return nh_notifier_res_table_info_init(info, nhg);
return -EINVAL;
}
@@ -130,6 +195,8 @@ static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
if (nhg->mpath)
kfree(info->nh_grp);
+ else if (nhg->resilient)
+ vfree(info->nh_res_table);
}
static int nh_notifier_info_init(struct nh_notifier_info *info,
@@ -181,6 +248,178 @@ static int call_nexthop_notifiers(struct net *net,
return notifier_to_errno(err);
}
+static int
+nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info,
+ bool force, unsigned int *p_idle_timer_ms)
+{
+ struct nh_res_table *res_table;
+ struct nh_group *nhg;
+ struct nexthop *nh;
+ int err = 0;
+
+ /* When 'force' is false, nexthop bucket replacement is performed
+ * because the bucket was deemed to be idle. In this case, capable
+ * listeners can choose to perform an atomic replacement: The bucket is
+ * only replaced if it is inactive. However, if the idle timer interval
+ * is smaller than the interval in which a listener is querying
+ * buckets' activity from the device, then atomic replacement should
+ * not be tried. Pass the idle timer value to listeners, so that they
+ * could determine which type of replacement to perform.
+ */
+ if (force) {
+ *p_idle_timer_ms = 0;
+ return 0;
+ }
+
+ rcu_read_lock();
+
+ nh = nexthop_find_by_id(info->net, info->id);
+ if (!nh) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ nhg = rcu_dereference(nh->nh_grp);
+ res_table = rcu_dereference(nhg->res_table);
+ *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer);
+
+out:
+ rcu_read_unlock();
+
+ return err;
+}
+
+static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info,
+ u16 bucket_index, bool force,
+ struct nh_info *oldi,
+ struct nh_info *newi)
+{
+ unsigned int idle_timer_ms;
+ int err;
+
+ err = nh_notifier_res_bucket_idle_timer_get(info, force,
+ &idle_timer_ms);
+ if (err)
+ return err;
+
+ info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET;
+ info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket),
+ GFP_KERNEL);
+ if (!info->nh_res_bucket)
+ return -ENOMEM;
+
+ info->nh_res_bucket->bucket_index = bucket_index;
+ info->nh_res_bucket->idle_timer_ms = idle_timer_ms;
+ info->nh_res_bucket->force = force;
+ __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi);
+ __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi);
+ return 0;
+}
+
+static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info)
+{
+ kfree(info->nh_res_bucket);
+}
+
+static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
+ u16 bucket_index, bool force,
+ struct nh_info *oldi,
+ struct nh_info *newi,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_notifier_info info = {
+ .net = net,
+ .extack = extack,
+ .id = nhg_id,
+ };
+ int err;
+
+ if (nexthop_notifiers_is_empty(net))
+ return 0;
+
+ err = nh_notifier_res_bucket_info_init(&info, bucket_index, force,
+ oldi, newi);
+ if (err)
+ return err;
+
+ err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+ NEXTHOP_EVENT_BUCKET_REPLACE, &info);
+ nh_notifier_res_bucket_info_fini(&info);
+
+ return notifier_to_errno(err);
+}
+
+/* There are three users of RES_TABLE, and NHs etc. referenced from there:
+ *
+ * 1) a collection of callbacks for NH maintenance. This operates under
+ * RTNL,
+ * 2) the delayed work that gradually balances the resilient table,
+ * 3) and nexthop_select_path(), operating under RCU.
+ *
+ * Both the delayed work and the RTNL block are writers, and need to
+ * maintain mutual exclusion. Since there are only two and well-known
+ * writers for each table, the RTNL code can make sure it has exclusive
+ * access thus:
+ *
+ * - Have the DW operate without locking;
+ * - synchronously cancel the DW;
+ * - do the writing;
+ * - if the write was not actually a delete, call upkeep, which schedules
+ * DW again if necessary.
+ *
+ * The functions that are always called from the RTNL context use
+ * rtnl_dereference(). The functions that can also be called from the DW do
+ * a raw dereference and rely on the above mutual exclusion scheme.
+ */
+#define nh_res_dereference(p) (rcu_dereference_raw(p))
+
+static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
+ u16 bucket_index, bool force,
+ struct nexthop *old_nh,
+ struct nexthop *new_nh,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *oldi = nh_res_dereference(old_nh->nh_info);
+ struct nh_info *newi = nh_res_dereference(new_nh->nh_info);
+
+ return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index,
+ force, oldi, newi, extack);
+}
+
+static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_notifier_info info = {
+ .net = net,
+ .extack = extack,
+ };
+ struct nh_group *nhg;
+ int err;
+
+ ASSERT_RTNL();
+
+ if (nexthop_notifiers_is_empty(net))
+ return 0;
+
+ /* At this point, the nexthop buckets are still not populated. Only
+ * emit a notification with the logical nexthops, so that a listener
+ * could potentially veto it in case of unsupported configuration.
+ */
+ nhg = rtnl_dereference(nh->nh_grp);
+ err = nh_notifier_mp_info_init(&info, nhg);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
+ return err;
+ }
+
+ err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+ NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
+ &info);
+ kfree(info.nh_grp);
+
+ return notifier_to_errno(err);
+}
+
static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
enum nexthop_event_type event_type,
struct nexthop *nh,
@@ -239,6 +478,9 @@ static void nexthop_free_group(struct nexthop *nh)
WARN_ON(nhg->spare == nhg);
+ if (nhg->resilient)
+ vfree(rcu_dereference_raw(nhg->res_table));
+
kfree(nhg->spare);
kfree(nhg);
}
@@ -297,6 +539,30 @@ static struct nh_group *nexthop_grp_alloc(u16 num_nh)
return nhg;
}
+static void nh_res_table_upkeep_dw(struct work_struct *work);
+
+static struct nh_res_table *
+nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg)
+{
+ const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets;
+ struct nh_res_table *res_table;
+ unsigned long size;
+
+ size = struct_size(res_table, nh_buckets, num_nh_buckets);
+ res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
+ if (!res_table)
+ return NULL;
+
+ res_table->net = net;
+ res_table->nhg_id = nhg_id;
+ INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw);
+ INIT_LIST_HEAD(&res_table->uw_nh_entries);
+ res_table->idle_timer = cfg->nh_grp_res_idle_timer;
+ res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer;
+ res_table->num_nh_buckets = num_nh_buckets;
+ return res_table;
+}
+
static void nh_base_seq_inc(struct net *net)
{
while (++net->nexthop.seq == 0)
@@ -345,6 +611,48 @@ static u32 nh_find_unused_id(struct net *net)
return 0;
}
+static void nh_res_time_set_deadline(unsigned long next_time,
+ unsigned long *deadline)
+{
+ if (time_before(next_time, *deadline))
+ *deadline = next_time;
+}
+
+static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table)
+{
+ if (list_empty(&res_table->uw_nh_entries))
+ return 0;
+ return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since);
+}
+
+static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg)
+{
+ struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NHA_RES_GROUP);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS,
+ res_table->num_nh_buckets) ||
+ nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER,
+ jiffies_to_clock_t(res_table->idle_timer)) ||
+ nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER,
+ jiffies_to_clock_t(res_table->unbalanced_timer)) ||
+ nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME,
+ nh_res_table_unbalanced_time(res_table),
+ NHA_RES_GROUP_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
{
struct nexthop_grp *p;
@@ -355,6 +663,8 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
if (nhg->mpath)
group_type = NEXTHOP_GRP_TYPE_MPATH;
+ else if (nhg->resilient)
+ group_type = NEXTHOP_GRP_TYPE_RES;
if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
goto nla_put_failure;
@@ -370,6 +680,9 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
p += 1;
}
+ if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
@@ -457,13 +770,26 @@ nla_put_failure:
return -EMSGSIZE;
}
+static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg)
+{
+ return nla_total_size(0) + /* NHA_RES_GROUP */
+ nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */
+ nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */
+ nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */
+ nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */
+}
+
static size_t nh_nlmsg_size_grp(struct nexthop *nh)
{
struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
+ size_t tot = nla_total_size(sz) +
+ nla_total_size(2); /* NHA_GROUP_TYPE */
+
+ if (nhg->resilient)
+ tot += nh_nlmsg_size_grp_res(nhg);
- return nla_total_size(sz) +
- nla_total_size(2); /* NHA_GROUP_TYPE */
+ return tot;
}
static size_t nh_nlmsg_size_single(struct nexthop *nh)
@@ -538,20 +864,144 @@ errout:
rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
}
+static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket)
+{
+ return (unsigned long)atomic_long_read(&bucket->used_time);
+}
+
+static unsigned long
+nh_res_bucket_idle_point(const struct nh_res_table *res_table,
+ const struct nh_res_bucket *bucket,
+ unsigned long now)
+{
+ unsigned long time = nh_res_bucket_used_time(bucket);
+
+ /* Bucket was not used since it was migrated. The idle time is now. */
+ if (time == bucket->migrated_time)
+ return now;
+
+ return time + res_table->idle_timer;
+}
+
+static unsigned long
+nh_res_table_unb_point(const struct nh_res_table *res_table)
+{
+ return res_table->unbalanced_since + res_table->unbalanced_timer;
+}
+
+static void nh_res_bucket_set_idle(const struct nh_res_table *res_table,
+ struct nh_res_bucket *bucket)
+{
+ unsigned long now = jiffies;
+
+ atomic_long_set(&bucket->used_time, (long)now);
+ bucket->migrated_time = now;
+}
+
+static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket)
+{
+ atomic_long_set(&bucket->used_time, (long)jiffies);
+}
+
+static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket)
+{
+ unsigned long used_time = nh_res_bucket_used_time(bucket);
+
+ return jiffies_delta_to_clock_t(jiffies - used_time);
+}
+
+static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh,
+ struct nh_res_bucket *bucket, u16 bucket_index,
+ int event, u32 portid, u32 seq,
+ unsigned int nlflags,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
+ struct nlmsghdr *nlh;
+ struct nlattr *nest;
+ struct nhmsg *nhm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ nhm = nlmsg_data(nlh);
+ nhm->nh_family = AF_UNSPEC;
+ nhm->nh_flags = bucket->nh_flags;
+ nhm->nh_protocol = nh->protocol;
+ nhm->nh_scope = 0;
+ nhm->resvd = 0;
+
+ if (nla_put_u32(skb, NHA_ID, nh->id))
+ goto nla_put_failure;
+
+ nest = nla_nest_start(skb, NHA_RES_BUCKET);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) ||
+ nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) ||
+ nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME,
+ nh_res_bucket_idle_time(bucket),
+ NHA_RES_BUCKET_PAD))
+ goto nla_put_failure_nest;
+
+ nla_nest_end(skb, nest);
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure_nest:
+ nla_nest_cancel(skb, nest);
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static void nexthop_bucket_notify(struct nh_res_table *res_table,
+ u16 bucket_index)
+{
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
+ struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
+ struct nexthop *nh = nhge->nh_parent;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
+ RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE,
+ NULL);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err);
+}
+
static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
bool *is_fdb, struct netlink_ext_ack *extack)
{
if (nh->is_group) {
struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
- /* nested multipath (group within a group) is not
- * supported
- */
+ /* Nesting groups within groups is not supported. */
if (nhg->mpath) {
NL_SET_ERR_MSG(extack,
"Multipath group can not be a nexthop within a group");
return false;
}
+ if (nhg->resilient) {
+ NL_SET_ERR_MSG(extack,
+ "Resilient group can not be a nexthop within a group");
+ return false;
+ }
*is_fdb = nhg->fdb_nh;
} else {
struct nh_info *nhi = rtnl_dereference(nh->nh_info);
@@ -591,7 +1041,7 @@ static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
static int nh_check_attr_group(struct net *net,
struct nlattr *tb[], size_t tb_size,
- struct netlink_ext_ack *extack)
+ u16 nh_grp_type, struct netlink_ext_ack *extack)
{
unsigned int len = nla_len(tb[NHA_GROUP]);
u8 nh_family = AF_UNSPEC;
@@ -652,8 +1102,14 @@ static int nh_check_attr_group(struct net *net,
for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) {
if (!tb[i])
continue;
- if (i == NHA_FDB)
+ switch (i) {
+ case NHA_FDB:
continue;
+ case NHA_RES_GROUP:
+ if (nh_grp_type == NEXTHOP_GRP_TYPE_RES)
+ continue;
+ break;
+ }
NL_SET_ERR_MSG(extack,
"No other attributes can be set in nexthop groups");
return -EINVAL;
@@ -732,6 +1188,22 @@ static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash)
return rc;
}
+static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
+{
+ struct nh_res_table *res_table = rcu_dereference(nhg->res_table);
+ u16 bucket_index = hash % res_table->num_nh_buckets;
+ struct nh_res_bucket *bucket;
+ struct nh_grp_entry *nhge;
+
+ /* nexthop_select_path() is expected to return a non-NULL value, so
+ * skip protocol validation and just hand out whatever there is.
+ */
+ bucket = &res_table->nh_buckets[bucket_index];
+ nh_res_bucket_set_busy(bucket);
+ nhge = rcu_dereference(bucket->nh_entry);
+ return nhge->nh;
+}
+
struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
{
struct nh_group *nhg;
@@ -742,6 +1214,8 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
nhg = rcu_dereference(nh->nh_grp);
if (nhg->mpath)
return nexthop_select_path_mp(nhg, hash);
+ else if (nhg->resilient)
+ return nexthop_select_path_res(nhg, hash);
/* Unreachable. */
return NULL;
@@ -924,7 +1398,319 @@ static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
return 0;
}
-static void nh_group_rebalance(struct nh_group *nhg)
+static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge)
+{
+ return nhge->res.count_buckets == nhge->res.wants_buckets;
+}
+
+static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge)
+{
+ return nhge->res.count_buckets > nhge->res.wants_buckets;
+}
+
+static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge)
+{
+ return nhge->res.count_buckets < nhge->res.wants_buckets;
+}
+
+static bool nh_res_table_is_balanced(const struct nh_res_table *res_table)
+{
+ return list_empty(&res_table->uw_nh_entries);
+}
+
+static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket)
+{
+ struct nh_grp_entry *nhge;
+
+ if (bucket->occupied) {
+ nhge = nh_res_dereference(bucket->nh_entry);
+ nhge->res.count_buckets--;
+ bucket->occupied = false;
+ }
+}
+
+static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket,
+ struct nh_grp_entry *nhge)
+{
+ nh_res_bucket_unset_nh(bucket);
+
+ bucket->occupied = true;
+ rcu_assign_pointer(bucket->nh_entry, nhge);
+ nhge->res.count_buckets++;
+}
+
+static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table,
+ struct nh_res_bucket *bucket,
+ unsigned long *deadline, bool *force)
+{
+ unsigned long now = jiffies;
+ struct nh_grp_entry *nhge;
+ unsigned long idle_point;
+
+ if (!bucket->occupied) {
+ /* The bucket is not occupied, its NHGE pointer is either
+ * NULL or obsolete. We _have to_ migrate: set force.
+ */
+ *force = true;
+ return true;
+ }
+
+ nhge = nh_res_dereference(bucket->nh_entry);
+
+ /* If the bucket is populated by an underweight or balanced
+ * nexthop, do not migrate.
+ */
+ if (!nh_res_nhge_is_ow(nhge))
+ return false;
+
+ /* At this point we know that the bucket is populated with an
+ * overweight nexthop. It needs to be migrated to a new nexthop if
+ * the idle timer or unbalanced timer expired.
+ */
+
+ idle_point = nh_res_bucket_idle_point(res_table, bucket, now);
+ if (time_after_eq(now, idle_point)) {
+ /* The bucket is idle. We _can_ migrate: unset force. */
+ *force = false;
+ return true;
+ }
+
+ /* Unbalanced timer of 0 means "never force". */
+ if (res_table->unbalanced_timer) {
+ unsigned long unb_point;
+
+ unb_point = nh_res_table_unb_point(res_table);
+ if (time_after(now, unb_point)) {
+ /* The bucket is not idle, but the unbalanced timer
+ * expired. We _can_ migrate, but set force anyway,
+ * so that drivers know to ignore activity reports
+ * from the HW.
+ */
+ *force = true;
+ return true;
+ }
+
+ nh_res_time_set_deadline(unb_point, deadline);
+ }
+
+ nh_res_time_set_deadline(idle_point, deadline);
+ return false;
+}
+
+static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
+ u16 bucket_index, bool notify,
+ bool notify_nl, bool force)
+{
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
+ struct nh_grp_entry *new_nhge;
+ struct netlink_ext_ack extack;
+ int err;
+
+ new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries,
+ struct nh_grp_entry,
+ res.uw_nh_entry);
+ if (WARN_ON_ONCE(!new_nhge))
+ /* If this function is called, "bucket" is either not
+ * occupied, or it belongs to a next hop that is
+ * overweight. In either case, there ought to be a
+ * corresponding underweight next hop.
+ */
+ return false;
+
+ if (notify) {
+ struct nh_grp_entry *old_nhge;
+
+ old_nhge = nh_res_dereference(bucket->nh_entry);
+ err = call_nexthop_res_bucket_notifiers(res_table->net,
+ res_table->nhg_id,
+ bucket_index, force,
+ old_nhge->nh,
+ new_nhge->nh, &extack);
+ if (err) {
+ pr_err_ratelimited("%s\n", extack._msg);
+ if (!force)
+ return false;
+ /* It is not possible to veto a forced replacement, so
+ * just clear the hardware flags from the nexthop
+ * bucket to indicate to user space that this bucket is
+ * not correctly populated in hardware.
+ */
+ bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
+ }
+ }
+
+ nh_res_bucket_set_nh(bucket, new_nhge);
+ nh_res_bucket_set_idle(res_table, bucket);
+
+ if (notify_nl)
+ nexthop_bucket_notify(res_table, bucket_index);
+
+ if (nh_res_nhge_is_balanced(new_nhge))
+ list_del(&new_nhge->res.uw_nh_entry);
+ return true;
+}
+
+#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2)
+
+static void nh_res_table_upkeep(struct nh_res_table *res_table,
+ bool notify, bool notify_nl)
+{
+ unsigned long now = jiffies;
+ unsigned long deadline;
+ u16 i;
+
+ /* Deadline is the next time that upkeep should be run. It is the
+ * earliest time at which one of the buckets might be migrated.
+ * Start at the most pessimistic estimate: either unbalanced_timer
+ * from now, or if there is none, idle_timer from now. For each
+ * encountered time point, call nh_res_time_set_deadline() to
+ * refine the estimate.
+ */
+ if (res_table->unbalanced_timer)
+ deadline = now + res_table->unbalanced_timer;
+ else
+ deadline = now + res_table->idle_timer;
+
+ for (i = 0; i < res_table->num_nh_buckets; i++) {
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ bool force;
+
+ if (nh_res_bucket_should_migrate(res_table, bucket,
+ &deadline, &force)) {
+ if (!nh_res_bucket_migrate(res_table, i, notify,
+ notify_nl, force)) {
+ unsigned long idle_point;
+
+ /* A driver can override the migration
+ * decision if the HW reports that the
+ * bucket is actually not idle. Therefore
+ * re-mark the bucket as busy again and
+ * update the deadline.
+ */
+ nh_res_bucket_set_busy(bucket);
+ idle_point = nh_res_bucket_idle_point(res_table,
+ bucket,
+ now);
+ nh_res_time_set_deadline(idle_point, &deadline);
+ }
+ }
+ }
+
+ /* If the group is still unbalanced, schedule the next upkeep to
+ * either the deadline computed above, or the minimum deadline,
+ * whichever comes later.
+ */
+ if (!nh_res_table_is_balanced(res_table)) {
+ unsigned long now = jiffies;
+ unsigned long min_deadline;
+
+ min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL;
+ if (time_before(deadline, min_deadline))
+ deadline = min_deadline;
+
+ queue_delayed_work(system_power_efficient_wq,
+ &res_table->upkeep_dw, deadline - now);
+ }
+}
+
+static void nh_res_table_upkeep_dw(struct work_struct *work)
+{
+ struct delayed_work *dw = to_delayed_work(work);
+ struct nh_res_table *res_table;
+
+ res_table = container_of(dw, struct nh_res_table, upkeep_dw);
+ nh_res_table_upkeep(res_table, true, true);
+}
+
+static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
+{
+ cancel_delayed_work_sync(&res_table->upkeep_dw);
+}
+
+static void nh_res_group_rebalance(struct nh_group *nhg,
+ struct nh_res_table *res_table)
+{
+ int prev_upper_bound = 0;
+ int total = 0;
+ int w = 0;
+ int i;
+
+ INIT_LIST_HEAD(&res_table->uw_nh_entries);
+
+ for (i = 0; i < nhg->num_nh; ++i)
+ total += nhg->nh_entries[i].weight;
+
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ int upper_bound;
+
+ w += nhge->weight;
+ upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w,
+ total);
+ nhge->res.wants_buckets = upper_bound - prev_upper_bound;
+ prev_upper_bound = upper_bound;
+
+ if (nh_res_nhge_is_uw(nhge)) {
+ if (list_empty(&res_table->uw_nh_entries))
+ res_table->unbalanced_since = jiffies;
+ list_add(&nhge->res.uw_nh_entry,
+ &res_table->uw_nh_entries);
+ }
+ }
+}
+
+/* Migrate buckets in res_table so that they reference NHGE's from NHG with
+ * the right NH ID. Set those buckets that do not have a corresponding NHGE
+ * entry in NHG as not occupied.
+ */
+static void nh_res_table_migrate_buckets(struct nh_res_table *res_table,
+ struct nh_group *nhg)
+{
+ u16 i;
+
+ for (i = 0; i < res_table->num_nh_buckets; i++) {
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ u32 id = rtnl_dereference(bucket->nh_entry)->nh->id;
+ bool found = false;
+ int j;
+
+ for (j = 0; j < nhg->num_nh; j++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[j];
+
+ if (nhge->nh->id == id) {
+ nh_res_bucket_set_nh(bucket, nhge);
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ nh_res_bucket_unset_nh(bucket);
+ }
+}
+
+static void replace_nexthop_grp_res(struct nh_group *oldg,
+ struct nh_group *newg)
+{
+ /* For NH group replacement, the new NHG might only have a stub
+ * hash table with 0 buckets, because the number of buckets was not
+ * specified. For NH removal, oldg and newg both reference the same
+ * res_table. So in any case, in the following, we want to work
+ * with oldg->res_table.
+ */
+ struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table);
+ unsigned long prev_unbalanced_since = old_res_table->unbalanced_since;
+ bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries);
+
+ nh_res_table_cancel_upkeep(old_res_table);
+ nh_res_table_migrate_buckets(old_res_table, newg);
+ nh_res_group_rebalance(newg, old_res_table);
+ if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries))
+ old_res_table->unbalanced_since = prev_unbalanced_since;
+ nh_res_table_upkeep(old_res_table, true, false);
+}
+
+static void nh_mp_group_rebalance(struct nh_group *nhg)
{
int total = 0;
int w = 0;
@@ -965,7 +1751,9 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
}
newg->has_v4 = false;
+ newg->is_multipath = nhg->is_multipath;
newg->mpath = nhg->mpath;
+ newg->resilient = nhg->resilient;
newg->fdb_nh = nhg->fdb_nh;
newg->num_nh = nhg->num_nh;
@@ -993,15 +1781,25 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
j++;
}
- nh_group_rebalance(newg);
+ if (newg->mpath)
+ nh_mp_group_rebalance(newg);
+ else if (newg->resilient)
+ replace_nexthop_grp_res(nhg, newg);
+
rcu_assign_pointer(nhp->nh_grp, newg);
list_del(&nhge->nh_list);
nexthop_put(nhge->nh);
- err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack);
- if (err)
- pr_err("%s\n", extack._msg);
+ /* Removal of a NH from a resilient group is notified through
+ * bucket notifications.
+ */
+ if (newg->mpath) {
+ err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
+ &extack);
+ if (err)
+ pr_err("%s\n", extack._msg);
+ }
if (nlinfo)
nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
@@ -1022,6 +1820,7 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
{
struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
+ struct nh_res_table *res_table;
int i, num_nh = nhg->num_nh;
for (i = 0; i < num_nh; ++i) {
@@ -1032,6 +1831,11 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
list_del_init(&nhge->nh_list);
}
+
+ if (nhg->resilient) {
+ res_table = rtnl_dereference(nhg->res_table);
+ nh_res_table_cancel_upkeep(res_table);
+ }
}
/* not called for nexthop replace */
@@ -1107,9 +1911,12 @@ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh)
}
static int replace_nexthop_grp(struct net *net, struct nexthop *old,
- struct nexthop *new,
+ struct nexthop *new, const struct nh_config *cfg,
struct netlink_ext_ack *extack)
{
+ struct nh_res_table *tmp_table = NULL;
+ struct nh_res_table *new_res_table;
+ struct nh_res_table *old_res_table;
struct nh_group *oldg, *newg;
int i, err;
@@ -1118,19 +1925,67 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old,
return -EINVAL;
}
- err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
- if (err)
- return err;
-
oldg = rtnl_dereference(old->nh_grp);
newg = rtnl_dereference(new->nh_grp);
+ if (newg->mpath != oldg->mpath) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type.");
+ return -EINVAL;
+ }
+
+ if (newg->mpath) {
+ err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new,
+ extack);
+ if (err)
+ return err;
+ } else if (newg->resilient) {
+ new_res_table = rtnl_dereference(newg->res_table);
+ old_res_table = rtnl_dereference(oldg->res_table);
+
+ /* Accept if num_nh_buckets was not given, but if it was
+ * given, demand that the value be correct.
+ */
+ if (cfg->nh_grp_res_has_num_buckets &&
+ cfg->nh_grp_res_num_buckets !=
+ old_res_table->num_nh_buckets) {
+ NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group.");
+ return -EINVAL;
+ }
+
+ /* Emit a pre-replace notification so that listeners could veto
+ * a potentially unsupported configuration. Otherwise,
+ * individual bucket replacement notifications would need to be
+ * vetoed, which is something that should only happen if the
+ * bucket is currently active.
+ */
+ err = call_nexthop_res_table_notifiers(net, new, extack);
+ if (err)
+ return err;
+
+ if (cfg->nh_grp_res_has_idle_timer)
+ old_res_table->idle_timer = cfg->nh_grp_res_idle_timer;
+ if (cfg->nh_grp_res_has_unbalanced_timer)
+ old_res_table->unbalanced_timer =
+ cfg->nh_grp_res_unbalanced_timer;
+
+ replace_nexthop_grp_res(oldg, newg);
+
+ tmp_table = new_res_table;
+ rcu_assign_pointer(newg->res_table, old_res_table);
+ rcu_assign_pointer(newg->spare->res_table, old_res_table);
+ }
+
/* update parents - used by nexthop code for cleanup */
for (i = 0; i < newg->num_nh; i++)
newg->nh_entries[i].nh_parent = old;
rcu_assign_pointer(old->nh_grp, newg);
+ if (newg->resilient) {
+ rcu_assign_pointer(oldg->res_table, tmp_table);
+ rcu_assign_pointer(oldg->spare->res_table, tmp_table);
+ }
+
for (i = 0; i < oldg->num_nh; i++)
oldg->nh_entries[i].nh_parent = new;
@@ -1156,6 +2011,71 @@ static void nh_group_v4_update(struct nh_group *nhg)
nhg->has_v4 = has_v4;
}
+static int replace_nexthop_single_notify_res(struct net *net,
+ struct nh_res_table *res_table,
+ struct nexthop *old,
+ struct nh_info *oldi,
+ struct nh_info *newi,
+ struct netlink_ext_ack *extack)
+{ /* Send a bucket notification (oldi -> newi) for every bucket of res_table
+   * whose entry references the nexthop old. If one notification fails,
+   * the buckets visited before it are re-notified with the two infos
+   * swapped and the error is returned. Returns 0 on success.
+   */
+ u32 nhg_id = res_table->nhg_id;
+ int err;
+ u16 i;
+
+ for (i = 0; i < res_table->num_nh_buckets; i++) {
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ struct nh_grp_entry *nhge;
+
+ nhge = rtnl_dereference(bucket->nh_entry);
+ if (nhge->nh == old) { /* only buckets that use the replaced nexthop */
+ err = __call_nexthop_res_bucket_notifiers(net, nhg_id,
+ i, true,
+ oldi, newi,
+ extack);
+ if (err)
+ goto err_notify;
+ }
+ }
+
+ return 0;
+
+err_notify:
+ while (i-- > 0) { /* walk back over buckets [0, i); the failing bucket itself is skipped */
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ struct nh_grp_entry *nhge;
+
+ nhge = rtnl_dereference(bucket->nh_entry);
+ if (nhge->nh == old)
+ __call_nexthop_res_bucket_notifiers(net, nhg_id, i, /* return value deliberately not checked */
+ true, newi, oldi,
+ extack);
+ }
+ return err;
+}
+
+static int replace_nexthop_single_notify(struct net *net,
+ struct nexthop *group_nh,
+ struct nexthop *old,
+ struct nh_info *oldi,
+ struct nh_info *newi,
+ struct netlink_ext_ack *extack)
+{ /* Notify about the replacement of nexthop old inside the group group_nh,
+   * picking the notification style from the group type.
+   */
+ struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp);
+ struct nh_res_table *res_table;
+
+ if (nhg->mpath) { /* mpath group: one REPLACE event for the whole group */
+ return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE,
+ group_nh, extack);
+ } else if (nhg->resilient) { /* resilient group: per-bucket notifications */
+ res_table = rtnl_dereference(nhg->res_table);
+ return replace_nexthop_single_notify_res(net, res_table,
+ old, oldi, newi,
+ extack);
+ }
+
+ return -EINVAL; /* group is neither mpath nor resilient */
+}
+
static int replace_nexthop_single(struct net *net, struct nexthop *old,
struct nexthop *new,
struct netlink_ext_ack *extack)
@@ -1198,8 +2118,8 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old,
list_for_each_entry(nhge, &old->grp_list, nh_list) {
struct nexthop *nhp = nhge->nh_parent;
- err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
- extack);
+ err = replace_nexthop_single_notify(net, nhp, old, oldi, newi,
+ extack);
if (err)
goto err_notify;
}
@@ -1229,7 +2149,7 @@ err_notify:
list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) {
struct nexthop *nhp = nhge->nh_parent;
- call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, extack);
+ replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL);
}
call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
return err;
@@ -1276,7 +2196,8 @@ static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
}
static int replace_nexthop(struct net *net, struct nexthop *old,
- struct nexthop *new, struct netlink_ext_ack *extack)
+ struct nexthop *new, const struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
{
bool new_is_reject = false;
struct nh_grp_entry *nhge;
@@ -1319,7 +2240,7 @@ static int replace_nexthop(struct net *net, struct nexthop *old,
}
if (old->is_group)
- err = replace_nexthop_grp(net, old, new, extack);
+ err = replace_nexthop_grp(net, old, new, cfg, extack);
else
err = replace_nexthop_single(net, old, new, extack);
@@ -1361,7 +2282,7 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh,
} else if (new_id > nh->id) {
pp = &next->rb_right;
} else if (replace) {
- rc = replace_nexthop(net, nh, new_nh, extack);
+ rc = replace_nexthop(net, nh, new_nh, cfg, extack);
if (!rc) {
new_nh = nh; /* send notification with old nh */
replace_notify = 1;
@@ -1379,9 +2300,37 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh,
goto out;
}
+ if (new_nh->is_group) {
+ struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp);
+ struct nh_res_table *res_table;
+
+ if (nhg->resilient) {
+ res_table = rtnl_dereference(nhg->res_table);
+
+ /* Not passing the number of buckets is OK when
+ * replacing, but not when creating a new group.
+ */
+ if (!cfg->nh_grp_res_has_num_buckets) {
+ NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ nh_res_group_rebalance(nhg, res_table);
+
+ /* Do not send bucket notifications, we do full
+ * notification below.
+ */
+ nh_res_table_upkeep(res_table, false, false);
+ }
+ }
+
rb_link_node_rcu(&new_nh->rb_node, parent, pp);
rb_insert_color(&new_nh->rb_node, root);
+ /* The initial insertion is a full notification for mpath as well
+ * as resilient groups.
+ */
rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
if (rc)
rb_erase(&new_nh->rb_node, &net->nexthop.rb_root);
@@ -1441,6 +2390,7 @@ static struct nexthop *nexthop_create_group(struct net *net,
u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
struct nh_group *nhg;
struct nexthop *nh;
+ int err;
int i;
if (WARN_ON(!num_nh))
@@ -1472,8 +2422,10 @@ static struct nexthop *nexthop_create_group(struct net *net,
struct nh_info *nhi;
nhe = nexthop_find_by_id(net, entry[i].id);
- if (!nexthop_get(nhe))
+ if (!nexthop_get(nhe)) {
+ err = -ENOENT;
goto out_no_nh;
+ }
nhi = rtnl_dereference(nhe->nh_info);
if (nhi->family == AF_INET)
@@ -1485,13 +2437,28 @@ static struct nexthop *nexthop_create_group(struct net *net,
nhg->nh_entries[i].nh_parent = nh;
}
- if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH)
+ if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
nhg->mpath = 1;
+ nhg->is_multipath = true;
+ } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) {
+ struct nh_res_table *res_table;
+
+ res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg);
+ if (!res_table) {
+ err = -ENOMEM;
+ goto out_no_nh;
+ }
+
+ rcu_assign_pointer(nhg->spare->res_table, res_table);
+ rcu_assign_pointer(nhg->res_table, res_table);
+ nhg->resilient = true;
+ nhg->is_multipath = true;
+ }
- WARN_ON_ONCE(nhg->mpath != 1);
+ WARN_ON_ONCE(nhg->mpath + nhg->resilient != 1);
if (nhg->mpath)
- nh_group_rebalance(nhg);
+ nh_mp_group_rebalance(nhg);
if (cfg->nh_fdb)
nhg->fdb_nh = 1;
@@ -1510,7 +2477,7 @@ out_no_nh:
kfree(nhg);
kfree(nh);
- return ERR_PTR(-ENOENT);
+ return ERR_PTR(err);
}
static int nh_create_ipv4(struct net *net, struct nexthop *nh,
@@ -1680,6 +2647,70 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
return nh;
}
+static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback,
+ unsigned long *timer_p, bool *has_p,
+ struct netlink_ext_ack *extack)
+{ /* Read an optional u32 timer attribute. *timer_p receives the value
+   * converted with clock_t_to_jiffies(), or fallback when attr is NULL;
+   * *has_p reports whether the attribute was present. Returns 0, or
+   * -EINVAL (outputs untouched) when the converted value is ~0UL.
+   */
+ unsigned long timer;
+ u32 value;
+
+ if (!attr) { /* attribute not supplied: use the fallback */
+ *timer_p = fallback;
+ *has_p = false;
+ return 0;
+ }
+
+ value = nla_get_u32(attr);
+ timer = clock_t_to_jiffies(value);
+ if (timer == ~0UL) { /* ~0UL is rejected as out of range */
+ NL_SET_ERR_MSG(extack, "Timer value too large");
+ return -EINVAL;
+ }
+
+ *timer_p = timer;
+ *has_p = true;
+ return 0;
+}
+
+static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{ /* Fill the resilient-group fields of cfg (bucket count, idle timer,
+   * unbalanced timer and their has_* flags) from the nested attribute
+   * res. res may be NULL, in which case every attribute is treated as
+   * absent. Returns 0 or a negative errno.
+   */
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {}; /* zeroed, so all entries are NULL when res is not parsed */
+ int err;
+
+ if (res) {
+ err = nla_parse_nested(tb,
+ ARRAY_SIZE(rtm_nh_res_policy_new) - 1,
+ res, rtm_nh_res_policy_new, extack);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[NHA_RES_GROUP_BUCKETS]) {
+ cfg->nh_grp_res_num_buckets =
+ nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]);
+ cfg->nh_grp_res_has_num_buckets = true;
+ if (!cfg->nh_grp_res_num_buckets) { /* an explicit bucket count of 0 is rejected */
+ NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0");
+ return -EINVAL;
+ }
+ }
+
+ err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER],
+ NH_RES_DEFAULT_IDLE_TIMER,
+ &cfg->nh_grp_res_idle_timer,
+ &cfg->nh_grp_res_has_idle_timer,
+ extack);
+ if (err)
+ return err;
+
+ return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER],
+ NH_RES_DEFAULT_UNBALANCED_TIMER,
+ &cfg->nh_grp_res_unbalanced_timer,
+ &cfg->nh_grp_res_has_unbalanced_timer,
+ extack);
+}
+
static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
struct nlmsghdr *nlh, struct nh_config *cfg,
struct netlink_ext_ack *extack)
@@ -1758,7 +2789,14 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
NL_SET_ERR_MSG(extack, "Invalid group type");
goto out;
}
- err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), extack);
+ err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb),
+ cfg->nh_grp_type, extack);
+ if (err)
+ goto out;
+
+ if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES)
+ err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP],
+ cfg, extack);
/* no other attributes should be set */
goto out;
@@ -1983,10 +3021,12 @@ errout_free:
}
struct nh_dump_filter {
+ u32 nh_id; /* value of NHA_ID in a bucket dump request; 0 when the attribute is absent */
int dev_idx;
int master_idx;
bool group_filter;
bool fdb_filter;
+ u32 res_bucket_nh_id; /* when non-zero, only buckets whose nexthop has this ID are dumped */
};
static bool nh_dump_filtered(struct nexthop *nh,
@@ -2166,6 +3206,318 @@ out_err:
return err;
}
+static struct nexthop *
+nexthop_find_group_resilient(struct net *net, u32 id,
+ struct netlink_ext_ack *extack)
+{ /* Look up nexthop id and require it to be a resilient group. Returns
+   * the nexthop, ERR_PTR(-ENOENT) when the ID is unknown, or
+   * ERR_PTR(-EINVAL) with an extack message when it is not a group or
+   * the group is not resilient.
+   */
+ struct nh_group *nhg;
+ struct nexthop *nh;
+
+ nh = nexthop_find_by_id(net, id);
+ if (!nh)
+ return ERR_PTR(-ENOENT); /* no extack message is set for this case */
+
+ if (!nh->is_group) {
+ NL_SET_ERR_MSG(extack, "Not a nexthop group");
+ return ERR_PTR(-EINVAL);
+ }
+
+ nhg = rtnl_dereference(nh->nh_grp);
+ if (!nhg->resilient) {
+ NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient");
+ return ERR_PTR(-EINVAL);
+ }
+
+ return nh;
+}
+
+static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p,
+ struct netlink_ext_ack *extack)
+{ /* Read an optional nexthop ID attribute into *nh_id_p. An absent
+   * attribute yields 0; an attribute whose value is 0 is rejected with
+   * -EINVAL and *nh_id_p is left untouched.
+   */
+ u32 idx;
+
+ if (attr) {
+ idx = nla_get_u32(attr);
+ if (!idx) { /* 0 is not accepted as an explicit ID */
+ NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+ return -EINVAL;
+ }
+ *nh_id_p = idx;
+ } else {
+ *nh_id_p = 0; /* 0 stands for "attribute not given" */
+ }
+
+ return 0;
+}
+
+static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh,
+ struct nh_dump_filter *filter,
+ struct netlink_callback *cb)
+{ /* Parse a bucket dump request into filter: NHA_ID goes to
+   * filter->nh_id, the optional nested NHA_RES_BUCKET_NH_ID goes to
+   * filter->res_bucket_nh_id, and the remaining attributes are handed
+   * to __nh_valid_dump_req(). Returns 0 or a negative errno.
+   */
+ struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)]; /* only filled and read when NHA_RES_BUCKET is present */
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)];
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+ ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1,
+ rtm_nh_policy_dump_bucket, NULL); /* NOTE(review): NULL extack here, while the nested parse below passes cb->extack - confirm this is intended */
+ if (err < 0)
+ return err;
+
+ err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack);
+ if (err)
+ return err;
+
+ if (tb[NHA_RES_BUCKET]) { /* when absent, filter->res_bucket_nh_id is not written here */
+ size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1;
+
+ err = nla_parse_nested(res_tb, max,
+ tb[NHA_RES_BUCKET],
+ rtm_nh_res_bucket_policy_dump,
+ cb->extack);
+ if (err < 0)
+ return err;
+
+ err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID],
+ &filter->res_bucket_nh_id,
+ cb->extack);
+ if (err)
+ return err;
+ }
+
+ return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
+}
+
+struct rtm_dump_res_bucket_ctx { /* resume state of a bucket dump; overlaid on cb->ctx */
+ struct rtm_dump_nh_ctx nh; /* state of the walk over nexthops */
+ u16 bucket_index; /* bucket at which the next pass over a group resumes */
+ u32 done_nh_idx; /* 1 + the index of the last fully processed NH. */
+};
+
+static struct rtm_dump_res_bucket_ctx *
+rtm_dump_res_bucket_ctx(struct netlink_callback *cb)
+{ /* Return cb->ctx viewed as the bucket dump resume state. */
+ struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx;
+
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); /* compile-time check that the state fits in cb->ctx */
+ return ctx;
+}
+
+struct rtm_dump_nexthop_bucket_data { /* per-call data passed to the bucket dump helpers */
+ struct rtm_dump_res_bucket_ctx *ctx; /* resume state, obtained from cb->ctx */
+ struct nh_dump_filter filter; /* filter parsed from the dump request */
+};
+
+static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nexthop *nh,
+ struct rtm_dump_nexthop_bucket_data *dd)
+{ /* Dump the buckets of the resilient group nh into skb, starting at the
+   * bucket saved in dd->ctx and applying dd->filter. Returns skb->len
+   * when the group was finished or at least some data is in skb, 0 when
+   * the group was already fully dumped, or the negative fill error when
+   * skb is still empty.
+   */
+ u32 portid = NETLINK_CB(cb->skb).portid;
+ struct nhmsg *nhm = nlmsg_data(cb->nlh);
+ struct nh_res_table *res_table;
+ struct nh_group *nhg;
+ u16 bucket_index;
+ int err;
+
+ if (dd->ctx->nh.idx < dd->ctx->done_nh_idx) /* this nexthop was fully dumped earlier */
+ return 0;
+
+ nhg = rtnl_dereference(nh->nh_grp);
+ res_table = rtnl_dereference(nhg->res_table);
+ for (bucket_index = dd->ctx->bucket_index; /* resume from the saved position */
+ bucket_index < res_table->num_nh_buckets;
+ bucket_index++) {
+ struct nh_res_bucket *bucket;
+ struct nh_grp_entry *nhge;
+
+ bucket = &res_table->nh_buckets[bucket_index];
+ nhge = rtnl_dereference(bucket->nh_entry);
+ if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family)) /* filter is applied to the bucket's nexthop, not the group */
+ continue;
+
+ if (dd->filter.res_bucket_nh_id &&
+ dd->filter.res_bucket_nh_id != nhge->nh->id)
+ continue;
+
+ err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
+ RTM_NEWNEXTHOPBUCKET, portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->extack);
+ if (err < 0) {
+ if (likely(skb->len)) /* skb already holds data: report its length instead of the error */
+ goto out;
+ goto out_err;
+ }
+ }
+
+ dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1; /* whole group dumped */
+ bucket_index = 0; /* the next group starts from its first bucket */
+
+out:
+ err = skb->len;
+out_err:
+ dd->ctx->bucket_index = bucket_index; /* remember where to resume */
+ return err;
+}
+
+static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nexthop *nh, void *data)
+{ /* Per-nexthop callback handed to rtm_dump_walk_nexthops(): dumps the
+   * buckets of nh when it is a resilient group and returns 0 for any
+   * other nexthop.
+   */
+ struct rtm_dump_nexthop_bucket_data *dd = data;
+ struct nh_group *nhg;
+
+ if (!nh->is_group) /* not a group: nothing to dump */
+ return 0;
+
+ nhg = rtnl_dereference(nh->nh_grp);
+ if (!nhg->resilient) /* group without a resilient table */
+ return 0;
+
+ return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd);
+}
+
+/* rtnl
+ *
+ * Dump nexthop buckets into skb. When the request names a group through
+ * NHA_ID, only that group is dumped and it must be a resilient group;
+ * otherwise all nexthops are walked and the buckets of every resilient
+ * group are dumped. Returns skb->len, or a negative errno when the request
+ * is invalid, the named group is unsuitable, or nothing could be written.
+ */
+static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb);
+ struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx }; /* dd.filter starts out zeroed */
+ struct net *net = sock_net(skb->sk);
+ struct nexthop *nh;
+ int err;
+
+ err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb);
+ if (err)
+ return err;
+
+ if (dd.filter.nh_id) { /* a single group was requested */
+ nh = nexthop_find_group_resilient(net, dd.filter.nh_id,
+ cb->extack);
+ if (IS_ERR(nh))
+ return PTR_ERR(nh);
+ err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd);
+ } else { /* no group given: walk every nexthop */
+ struct rb_root *root = &net->nexthop.rb_root;
+
+ err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh,
+ &rtm_dump_nexthop_bucket_cb, &dd);
+ }
+
+ if (err < 0) {
+ if (likely(skb->len)) /* partial output exists: return its length */
+ goto out;
+ goto out_err;
+ }
+
+out:
+ err = skb->len;
+out_err:
+ cb->seq = net->nexthop.seq;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ return err;
+}
+
+static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res,
+ u16 *bucket_index,
+ struct netlink_ext_ack *extack)
+{ /* Parse the nested bucket attribute res of a get request and store the
+   * mandatory NHA_RES_BUCKET_INDEX in *bucket_index. Returns 0, a parse
+   * error, or -EINVAL when the index is missing.
+   */
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)];
+ int err;
+
+ err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1,
+ res, rtm_nh_res_bucket_policy_get, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[NHA_RES_BUCKET_INDEX]) { /* the index is required */
+ NL_SET_ERR_MSG(extack, "Bucket index is missing");
+ return -EINVAL;
+ }
+
+ *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]);
+ return 0;
+}
+
+static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
+ u32 *id, u16 *bucket_index,
+ struct netlink_ext_ack *extack)
+{ /* Validate a bucket get request: the nexthop ID is extracted by
+   * __nh_valid_get_del_req() into *id and the bucket index, taken from
+   * the mandatory NHA_RES_BUCKET attribute, into *bucket_index. Returns
+   * 0 or a negative errno.
+   */
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)];
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+ ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1,
+ rtm_nh_policy_get_bucket, extack);
+ if (err < 0)
+ return err;
+
+ err = __nh_valid_get_del_req(nlh, tb, id, extack);
+ if (err)
+ return err;
+
+ if (!tb[NHA_RES_BUCKET]) { /* the nested bucket attribute is required */
+ NL_SET_ERR_MSG(extack, "Bucket information is missing");
+ return -EINVAL;
+ }
+
+ err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET],
+ bucket_index, extack);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+/* rtnl
+ *
+ * Answer a request for a single bucket of a resilient nexthop group: the
+ * request is validated, the group is looked up, the bucket index is checked
+ * against the table size, and one RTM_NEWNEXTHOPBUCKET message is unicast
+ * back to the requester. Returns the result of rtnl_unicast() or a negative
+ * errno.
+ */
+static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct nh_res_table *res_table;
+ struct sk_buff *skb = NULL;
+ struct nh_group *nhg;
+ struct nexthop *nh;
+ u16 bucket_index;
+ int err;
+ u32 id;
+
+ err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack);
+ if (err)
+ return err;
+
+ nh = nexthop_find_group_resilient(net, id, extack);
+ if (IS_ERR(nh))
+ return PTR_ERR(nh);
+
+ nhg = rtnl_dereference(nh->nh_grp);
+ res_table = rtnl_dereference(nhg->res_table);
+ if (bucket_index >= res_table->num_nh_buckets) { /* an out-of-range index is reported as -ENOENT */
+ NL_SET_ERR_MSG(extack, "Bucket index out of bounds");
+ return -ENOENT;
+ }
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index],
+ bucket_index, RTM_NEWNEXTHOPBUCKET,
+ NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
+ 0, extack);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE); /* warn if a single bucket does not fit in the reply skb */
+ goto errout_free;
+ }
+
+ return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+
+errout_free:
+ kfree_skb(skb); /* skb was not sent, so it is freed here */
+ return err;
+}
+
static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
{
unsigned int hash = nh_dev_hashfn(dev->ifindex);
@@ -2277,6 +3629,75 @@ out:
}
EXPORT_SYMBOL(nexthop_set_hw_flags);
+/* Set the hardware flags of one bucket of a resilient nexthop group.
+ *
+ * RTNH_F_OFFLOAD and RTNH_F_TRAP are first cleared on bucket bucket_index of
+ * group id and then set again according to offload and trap. The call is a
+ * silent no-op when id does not exist, is not a group, is not a resilient
+ * group, or when bucket_index is outside of the group's table.
+ *
+ * The lookup runs inside an RCU read-side critical section. The table
+ * pointer is fetched exactly once with rcu_dereference(), and both the
+ * bounds check and the bucket access use that one snapshot, so the index
+ * can never be validated against one table and applied to another.
+ */
+void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
+ bool offload, bool trap)
+{
+ struct nh_res_table *res_table;
+ struct nh_res_bucket *bucket;
+ struct nexthop *nexthop;
+ struct nh_group *nhg;
+
+ rcu_read_lock();
+
+ nexthop = nexthop_find_by_id(net, id);
+ if (!nexthop || !nexthop->is_group)
+ goto out;
+
+ nhg = rcu_dereference(nexthop->nh_grp);
+ if (!nhg->resilient)
+ goto out;
+
+ res_table = rcu_dereference(nhg->res_table);
+ if (bucket_index >= res_table->num_nh_buckets)
+ goto out;
+
+ bucket = &res_table->nh_buckets[bucket_index];
+ bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
+ if (offload)
+ bucket->nh_flags |= RTNH_F_OFFLOAD;
+ if (trap)
+ bucket->nh_flags |= RTNH_F_TRAP;
+
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(nexthop_bucket_set_hw_flags);
+
+void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
+ unsigned long *activity)
+{ /* Mark as busy every bucket of resilient group id whose bit is set in
+   * the activity bitmap. Nothing is done when id does not exist, is not
+   * a resilient group, or num_buckets differs from the table size.
+   */
+ struct nh_res_table *res_table;
+ struct nexthop *nexthop;
+ struct nh_group *nhg;
+ u16 i;
+
+ rcu_read_lock();
+
+ nexthop = nexthop_find_by_id(net, id);
+ if (!nexthop || !nexthop->is_group)
+ goto out;
+
+ nhg = rcu_dereference(nexthop->nh_grp);
+ if (!nhg->resilient)
+ goto out;
+
+ /* Instead of silently ignoring some buckets, demand that the sizes
+ * be the same.
+ */
+ res_table = rcu_dereference(nhg->res_table);
+ if (num_buckets != res_table->num_nh_buckets)
+ goto out;
+
+ for (i = 0; i < num_buckets; i++) {
+ if (test_bit(i, activity)) /* bit i of activity corresponds to bucket i */
+ nh_res_bucket_set_busy(&res_table->nh_buckets[i]);
+ }
+
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(nexthop_res_grp_activity_update);
+
static void __net_exit nexthop_net_exit(struct net *net)
{
rtnl_lock();
@@ -2320,6 +3741,9 @@ static int __init nexthop_init(void)
rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket,
+ rtm_dump_nexthop_bucket, 0);
+
return 0;
}
subsys_initcall(nexthop_init);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index bba150fdd265..f6787c55f6ab 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -21,7 +21,7 @@
* Alan Cox : Added BSD route gw semantics
* Alan Cox : Super /proc >4K
* Alan Cox : MTU in route table
- * Alan Cox : MSS actually. Also added the window
+ * Alan Cox : MSS actually. Also added the window
* clamper.
* Sam Lantinga : Fixed route matching in rt_del()
* Alan Cox : Routing cache support.
@@ -41,7 +41,7 @@
* Olaf Erb : irtt wasn't being copied right.
* Bjorn Ekwall : Kerneld route support.
* Alan Cox : Multicast fixed (I hope)
- * Pavel Krauz : Limited broadcast fixed
+ * Pavel Krauz : Limited broadcast fixed
* Mike McLagan : Routing by source
* Alexey Kuznetsov : End of old history. Split to fib.c and
* route.c and rewritten from scratch.
@@ -54,8 +54,8 @@
* Robert Olsson : Added rt_cache statistics
* Arnaldo C. Melo : Convert proc stuff to seq_file
* Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
- * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
- * Ilia Sotnikov : Removed TOS from hash calculations
+ * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
+ * Ilia Sotnikov : Removed TOS from hash calculations
*/
#define pr_fmt(fmt) "IPv4: " fmt
@@ -66,6 +66,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/memblock.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
@@ -234,19 +235,6 @@ static const struct seq_operations rt_cache_seq_ops = {
.show = rt_cache_seq_show,
};
-static int rt_cache_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &rt_cache_seq_ops);
-}
-
-static const struct proc_ops rt_cache_proc_ops = {
- .proc_open = rt_cache_seq_open,
- .proc_read = seq_read,
- .proc_lseek = seq_lseek,
- .proc_release = seq_release,
-};
-
-
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
int cpu;
@@ -324,19 +312,6 @@ static const struct seq_operations rt_cpu_seq_ops = {
.show = rt_cpu_seq_show,
};
-
-static int rt_cpu_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &rt_cpu_seq_ops);
-}
-
-static const struct proc_ops rt_cpu_proc_ops = {
- .proc_open = rt_cpu_seq_open,
- .proc_read = seq_read,
- .proc_lseek = seq_lseek,
- .proc_release = seq_release,
-};
-
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
@@ -367,13 +342,13 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
{
struct proc_dir_entry *pde;
- pde = proc_create("rt_cache", 0444, net->proc_net,
- &rt_cache_proc_ops);
+ pde = proc_create_seq("rt_cache", 0444, net->proc_net,
+ &rt_cache_seq_ops);
if (!pde)
goto err1;
- pde = proc_create("rt_cache", 0444,
- net->proc_net_stat, &rt_cpu_proc_ops);
+ pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
+ &rt_cpu_seq_ops);
if (!pde)
goto err2;
@@ -478,8 +453,10 @@ static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
-#define IP_IDENTS_SZ 2048u
-
+/* Hash tables of size 2048..262144 depending on RAM size.
+ * Each bucket uses 8 bytes.
+ */
+static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;
@@ -489,12 +466,16 @@ static u32 *ip_tstamps __read_mostly;
*/
u32 ip_idents_reserve(u32 hash, int segs)
{
- u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
- atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
- u32 old = READ_ONCE(*p_tstamp);
- u32 now = (u32)jiffies;
+ u32 bucket, old, now = (u32)jiffies;
+ atomic_t *p_id;
+ u32 *p_tstamp;
u32 delta = 0;
+ bucket = hash & ip_idents_mask;
+ p_tstamp = ip_tstamps + bucket;
+ p_id = ip_idents + bucket;
+ old = READ_ONCE(*p_tstamp);
+
if (old != now && cmpxchg(p_tstamp, old, now) == old)
delta = prandom_u32_max(now - old);
@@ -722,6 +703,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
for_each_possible_cpu(i) {
struct rtable __rcu **prt;
+
prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
rt = rcu_dereference(*prt);
if (rt)
@@ -1258,12 +1240,12 @@ static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
}
/*
- We do not cache source address of outgoing interface,
- because it is used only by IP RR, TS and SRR options,
- so that it out of fast path.
-
- BTW remember: "addr" is allowed to be not aligned
- in IP options!
+ * We do not cache source address of outgoing interface,
+ * because it is used only by IP RR, TS and SRR options,
+ * so that it is out of the fast path.
+ *
+ * BTW remember: "addr" is allowed to be not aligned
+ * in IP options!
*/
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
@@ -2108,7 +2090,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto out;
/* Check for the most weird martians, which can be not detected
- by fib_lookup.
+ * by fib_lookup.
*/
tun_info = skb_tunnel_info(skb);
@@ -2246,7 +2228,7 @@ local_input:
if (res->type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
rth->dst.error= -err;
- rth->rt_flags &= ~RTCF_LOCAL;
+ rth->rt_flags &= ~RTCF_LOCAL;
}
if (do_cache) {
@@ -2317,15 +2299,15 @@ int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, struct fib_result *res)
{
/* Multicast recognition logic is moved from route cache to here.
- The problem was that too many Ethernet cards have broken/missing
- hardware multicast filters :-( As result the host on multicasting
- network acquires a lot of useless route cache entries, sort of
- SDR messages from all the world. Now we try to get rid of them.
- Really, provided software IP multicast filter is organized
- reasonably (at least, hashed), it does not result in a slowdown
- comparing with route cache reject entries.
- Note, that multicast routers are not affected, because
- route cache entry is created eventually.
+ * The problem was that too many Ethernet cards have broken/missing
+ * hardware multicast filters :-( As result the host on multicasting
+ * network acquires a lot of useless route cache entries, sort of
+ * SDR messages from all the world. Now we try to get rid of them.
+ * Really, provided software IP multicast filter is organized
+ * reasonably (at least, hashed), it does not result in a slowdown
+ * comparing with route cache reject entries.
+ * Note, that multicast routers are not affected, because
+ * route cache entry is created eventually.
*/
if (ipv4_is_multicast(daddr)) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -2537,11 +2519,11 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
rth = ERR_PTR(-ENETUNREACH);
/* I removed check for oif == dev_out->oif here.
- It was wrong for two reasons:
- 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
- is assigned to multiple interfaces.
- 2. Moreover, we are allowed to send packets with saddr
- of another iface. --ANK
+ * It was wrong for two reasons:
+ * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
+ * is assigned to multiple interfaces.
+ * 2. Moreover, we are allowed to send packets with saddr
+ * of another iface. --ANK
*/
if (fl4->flowi4_oif == 0 &&
@@ -2553,18 +2535,18 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
goto out;
/* Special hack: user can direct multicasts
- and limited broadcast via necessary interface
- without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
- This hack is not just for fun, it allows
- vic,vat and friends to work.
- They bind socket to loopback, set ttl to zero
- and expect that it will work.
- From the viewpoint of routing cache they are broken,
- because we are not allowed to build multicast path
- with loopback source addr (look, routing cache
- cannot know, that ttl is zero, so that packet
- will not leave this host and route is valid).
- Luckily, this hack is good workaround.
+ * and limited broadcast via necessary interface
+ * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
+ * This hack is not just for fun, it allows
+ * vic,vat and friends to work.
+ * They bind socket to loopback, set ttl to zero
+ * and expect that it will work.
+ * From the viewpoint of routing cache they are broken,
+ * because we are not allowed to build multicast path
+ * with loopback source addr (look, routing cache
+ * cannot know, that ttl is zero, so that packet
+ * will not leave this host and route is valid).
+ * Luckily, this hack is good workaround.
*/
fl4->flowi4_oif = dev_out->ifindex;
@@ -2627,21 +2609,21 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
(ipv4_is_multicast(fl4->daddr) ||
!netif_index_is_l3_master(net, fl4->flowi4_oif))) {
/* Apparently, routing tables are wrong. Assume,
- that the destination is on link.
-
- WHY? DW.
- Because we are allowed to send to iface
- even if it has NO routes and NO assigned
- addresses. When oif is specified, routing
- tables are looked up with only one purpose:
- to catch if destination is gatewayed, rather than
- direct. Moreover, if MSG_DONTROUTE is set,
- we send packet, ignoring both routing tables
- and ifaddr state. --ANK
-
-
- We could make it even if oif is unknown,
- likely IPv6, but we do not.
+ * that the destination is on link.
+ *
+ * WHY? DW.
+ * Because we are allowed to send to iface
+ * even if it has NO routes and NO assigned
+ * addresses. When oif is specified, routing
+ * tables are looked up with only one purpose:
+ * to catch if destination is gatewayed, rather than
+ * direct. Moreover, if MSG_DONTROUTE is set,
+ * we send packet, ignoring both routing tables
+ * and ifaddr state. --ANK
+ *
+ *
+ * We could make it even if oif is unknown,
+ * likely IPv6, but we do not.
*/
if (fl4->saddr == 0)
@@ -3553,18 +3535,25 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
int __init ip_rt_init(void)
{
+ void *idents_hash;
int cpu;
- ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
- GFP_KERNEL);
- if (!ip_idents)
- panic("IP: failed to allocate ip_idents\n");
+ /* For modern hosts, this will use 2 MB of memory */
+ idents_hash = alloc_large_system_hash("IP idents",
+ sizeof(*ip_idents) + sizeof(*ip_tstamps),
+ 0,
+ 16, /* one bucket per 64 KB */
+ HASH_ZERO,
+ NULL,
+ &ip_idents_mask,
+ 2048,
+ 256*1024);
+
+ ip_idents = idents_hash;
- prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+ prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
- ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
- if (!ip_tstamps)
- panic("IP: failed to allocate ip_tstamps\n");
+ ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
for_each_possible_cpu(cpu) {
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index bc7d2a586e18..17c322b875fd 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -229,7 +229,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
}
EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
-#ifdef CONFIG_BPF_STREAM_PARSER
+#ifdef CONFIG_BPF_SYSCALL
static bool tcp_bpf_stream_read(const struct sock *sk)
{
struct sk_psock *psock;
@@ -629,4 +629,4 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE])
newsk->sk_prot = sk->sk_prot_creator;
}
-#endif /* CONFIG_BPF_STREAM_PARSER */
+#endif /* CONFIG_BPF_SYSCALL */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 69a545db80d2..4cf4dd532d1c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2914,7 +2914,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
/* D. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
if (icsk->icsk_ca_state == TCP_CA_Open) {
- WARN_ON(tp->retrans_out != 0);
+ WARN_ON(tp->retrans_out != 0 && !tp->syn_data);
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
switch (icsk->icsk_ca_state) {
@@ -5994,11 +5994,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
else
tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
- skb_rbtree_walk_from(data) {
- if (__tcp_retransmit_skb(sk, data, 1))
- break;
- }
- tcp_rearm_rto(sk);
+ skb_rbtree_walk_from(data)
+ tcp_mark_skb_lost(sk, data);
+ tcp_xmit_retransmit_queue(sk);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
return true;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fbf140a770d8..bde781f46b41 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2775,13 +2775,17 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
* a packet is still in a qdisc or driver queue.
* In this case, there is very little point doing a retransmit !
*/
-static bool skb_still_in_host_queue(const struct sock *sk,
+static bool skb_still_in_host_queue(struct sock *sk,
const struct sk_buff *skb)
{
if (unlikely(skb_fclone_busy(sk, skb))) {
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
- return true;
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+ smp_mb__after_atomic();
+ if (skb_fclone_busy(sk, skb)) {
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+ return true;
+ }
}
return false;
}
@@ -3147,14 +3151,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (icsk->icsk_mtup.probe_size)
icsk->icsk_mtup.probe_size = 0;
- /* Do not sent more than we queued. 1/4 is reserved for possible
- * copying overhead: fragmentation, tunneling, mangling etc.
- */
- if (refcount_read(&sk->sk_wmem_alloc) >
- min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
- sk->sk_sndbuf))
- return -EAGAIN;
-
if (skb_still_in_host_queue(sk, skb))
return -EBUSY;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 6126f8bf94b3..56e479d158b7 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -381,7 +381,7 @@ static int ipv6_srh_rcv(struct sk_buff *skb)
looped_back:
if (hdr->segments_left == 0) {
- if (hdr->nexthdr == NEXTHDR_IPV6) {
+ if (hdr->nexthdr == NEXTHDR_IPV6 || hdr->nexthdr == NEXTHDR_IPV4) {
int offset = (hdr->hdrlen + 1) << 3;
skb_postpull_rcsum(skb, skb_network_header(skb),
@@ -397,7 +397,8 @@ looped_back:
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
skb->encapsulation = 0;
-
+ if (hdr->nexthdr == NEXTHDR_IPV4)
+ skb->protocol = htons(ETH_P_IP);
__skb_tunnel_rx(skb, skb->dev, net);
netif_rx(skb);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1056b0229ffd..ebb7519bec2a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2360,7 +2360,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
memset(&hash_keys, 0, sizeof(hash_keys));
- if (!flkeys) {
+ if (!flkeys) {
skb_flow_dissect_flow_keys(skb, &keys, flag);
flkeys = &keys;
}
@@ -2500,20 +2500,20 @@ struct dst_entry *ip6_route_output_flags(struct net *net,
struct flowi6 *fl6,
int flags)
{
- struct dst_entry *dst;
- struct rt6_info *rt6;
+ struct dst_entry *dst;
+ struct rt6_info *rt6;
- rcu_read_lock();
- dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
- rt6 = (struct rt6_info *)dst;
- /* For dst cached in uncached_list, refcnt is already taken. */
- if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
- dst = &net->ipv6.ip6_null_entry->dst;
- dst_hold(dst);
- }
- rcu_read_unlock();
+ rcu_read_lock();
+ dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
+ rt6 = (struct rt6_info *)dst;
+ /* For dst cached in uncached_list, refcnt is already taken. */
+ if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
+ dst = &net->ipv6.ip6_null_entry->dst;
+ dst_hold(dst);
+ }
+ rcu_read_unlock();
- return dst;
+ return dst;
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index c2a0c78e84d4..8936f48570fc 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -119,12 +119,12 @@ static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt)
return (struct seg6_local_lwt *)lwt->data;
}
-static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb)
+static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb, int flags)
{
struct ipv6_sr_hdr *srh;
int len, srhoff = 0;
- if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, &flags) < 0)
return NULL;
if (!pskb_may_pull(skb, srhoff + sizeof(*srh)))
@@ -152,13 +152,10 @@ static struct ipv6_sr_hdr *get_and_validate_srh(struct sk_buff *skb)
{
struct ipv6_sr_hdr *srh;
- srh = get_srh(skb);
+ srh = get_srh(skb, IP6_FH_F_SKIP_RH);
if (!srh)
return NULL;
- if (srh->segments_left == 0)
- return NULL;
-
#ifdef CONFIG_IPV6_SEG6_HMAC
if (!seg6_hmac_validate_skb(skb))
return NULL;
@@ -172,7 +169,7 @@ static bool decap_and_validate(struct sk_buff *skb, int proto)
struct ipv6_sr_hdr *srh;
unsigned int off = 0;
- srh = get_srh(skb);
+ srh = get_srh(skb, 0);
if (srh && srh->segments_left > 0)
return false;
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 203890e378cb..2ee20743cb41 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -802,7 +802,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
u16 version;
int length;
- /* UDP has verifed checksum */
+ /* UDP has verified checksum */
/* UDP always verifies the packet length. */
__skb_pull(skb, sizeof(struct udphdr));
diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c
index 0511bbe4af7b..1078e14f1acf 100644
--- a/net/lapb/lapb_iface.c
+++ b/net/lapb/lapb_iface.c
@@ -122,8 +122,8 @@ static struct lapb_cb *lapb_create_cb(void)
timer_setup(&lapb->t1timer, NULL, 0);
timer_setup(&lapb->t2timer, NULL, 0);
- lapb->t1timer_stop = true;
- lapb->t2timer_stop = true;
+ lapb->t1timer_running = false;
+ lapb->t2timer_running = false;
lapb->t1 = LAPB_DEFAULT_T1;
lapb->t2 = LAPB_DEFAULT_T2;
diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c
index 0230b272b7d1..5be68869064d 100644
--- a/net/lapb/lapb_timer.c
+++ b/net/lapb/lapb_timer.c
@@ -40,7 +40,7 @@ void lapb_start_t1timer(struct lapb_cb *lapb)
lapb->t1timer.function = lapb_t1timer_expiry;
lapb->t1timer.expires = jiffies + lapb->t1;
- lapb->t1timer_stop = false;
+ lapb->t1timer_running = true;
add_timer(&lapb->t1timer);
}
@@ -51,25 +51,25 @@ void lapb_start_t2timer(struct lapb_cb *lapb)
lapb->t2timer.function = lapb_t2timer_expiry;
lapb->t2timer.expires = jiffies + lapb->t2;
- lapb->t2timer_stop = false;
+ lapb->t2timer_running = true;
add_timer(&lapb->t2timer);
}
void lapb_stop_t1timer(struct lapb_cb *lapb)
{
- lapb->t1timer_stop = true;
+ lapb->t1timer_running = false;
del_timer(&lapb->t1timer);
}
void lapb_stop_t2timer(struct lapb_cb *lapb)
{
- lapb->t2timer_stop = true;
+ lapb->t2timer_running = false;
del_timer(&lapb->t2timer);
}
int lapb_t1timer_running(struct lapb_cb *lapb)
{
- return timer_pending(&lapb->t1timer);
+ return lapb->t1timer_running;
}
static void lapb_t2timer_expiry(struct timer_list *t)
@@ -79,13 +79,14 @@ static void lapb_t2timer_expiry(struct timer_list *t)
spin_lock_bh(&lapb->lock);
if (timer_pending(&lapb->t2timer)) /* A new timer has been set up */
goto out;
- if (lapb->t2timer_stop) /* The timer has been stopped */
+ if (!lapb->t2timer_running) /* The timer has been stopped */
goto out;
if (lapb->condition & LAPB_ACK_PENDING_CONDITION) {
lapb->condition &= ~LAPB_ACK_PENDING_CONDITION;
lapb_timeout_response(lapb);
}
+ lapb->t2timer_running = false;
out:
spin_unlock_bh(&lapb->lock);
@@ -98,7 +99,7 @@ static void lapb_t1timer_expiry(struct timer_list *t)
spin_lock_bh(&lapb->lock);
if (timer_pending(&lapb->t1timer)) /* A new timer has been set up */
goto out;
- if (lapb->t1timer_stop) /* The timer has been stopped */
+ if (!lapb->t1timer_running) /* The timer has been stopped */
goto out;
switch (lapb->state) {
@@ -127,6 +128,7 @@ static void lapb_t1timer_expiry(struct timer_list *t)
lapb->state = LAPB_STATE_0;
lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev);
+ lapb->t1timer_running = false;
goto out;
} else {
lapb->n2count++;
@@ -151,6 +153,7 @@ static void lapb_t1timer_expiry(struct timer_list *t)
lapb->state = LAPB_STATE_0;
lapb_disconnect_confirmation(lapb, LAPB_TIMEDOUT);
lapb_dbg(0, "(%p) S2 -> S0\n", lapb->dev);
+ lapb->t1timer_running = false;
goto out;
} else {
lapb->n2count++;
@@ -169,6 +172,7 @@ static void lapb_t1timer_expiry(struct timer_list *t)
lapb_stop_t2timer(lapb);
lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
lapb_dbg(0, "(%p) S3 -> S0\n", lapb->dev);
+ lapb->t1timer_running = false;
goto out;
} else {
lapb->n2count++;
@@ -186,6 +190,7 @@ static void lapb_t1timer_expiry(struct timer_list *t)
lapb->state = LAPB_STATE_0;
lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
lapb_dbg(0, "(%p) S4 -> S0\n", lapb->dev);
+ lapb->t1timer_running = false;
goto out;
} else {
lapb->n2count++;
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 89a4225ed321..2b7eec93c9f5 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -26,6 +26,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
int expected_opsize;
u8 version;
u8 flags;
+ u8 i;
switch (subtype) {
case MPTCPOPT_MP_CAPABLE:
@@ -272,14 +273,17 @@ static void mptcp_parse_option(const struct sk_buff *skb,
break;
case MPTCPOPT_RM_ADDR:
- if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE)
+ if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 ||
+ opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX)
break;
ptr++;
mp_opt->rm_addr = 1;
- mp_opt->rm_id = *ptr++;
- pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
+ mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE;
+ for (i = 0; i < mp_opt->rm_list.nr; i++)
+ mp_opt->rm_list.ids[i] = *ptr++;
+ pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr);
break;
case MPTCPOPT_MP_PRIO:
@@ -676,20 +680,25 @@ static bool mptcp_established_options_rm_addr(struct sock *sk,
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
- u8 rm_id;
+ struct mptcp_rm_list rm_list;
+ int i, len;
if (!mptcp_pm_should_rm_signal(msk) ||
- !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_id)))
+ !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list)))
return false;
- if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE)
+ len = mptcp_rm_addr_len(&rm_list);
+ if (len < 0)
+ return false;
+ if (remaining < len)
return false;
- *size = TCPOLEN_MPTCP_RM_ADDR_BASE;
+ *size = len;
opts->suboptions |= OPTION_MPTCP_RM_ADDR;
- opts->rm_id = rm_id;
+ opts->rm_list = rm_list;
- pr_debug("rm_id=%d", opts->rm_id);
+ for (i = 0; i < opts->rm_list.nr; i++)
+ pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]);
return true;
}
@@ -1042,7 +1051,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
}
if (mp_opt.rm_addr) {
- mptcp_pm_rm_addr_received(msk, mp_opt.rm_id);
+ mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list);
mp_opt.rm_addr = 0;
}
@@ -1221,9 +1230,23 @@ mp_capable_done:
}
if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
+ u8 i = 1;
+
*ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
- TCPOLEN_MPTCP_RM_ADDR_BASE,
- 0, opts->rm_id);
+ TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr,
+ 0, opts->rm_list.ids[0]);
+
+ while (i < opts->rm_list.nr) {
+ u8 id1, id2, id3, id4;
+
+ id1 = opts->rm_list.ids[i];
+ id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP;
+ id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP;
+ id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP;
+ put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr);
+ ptr += 1;
+ i += 4;
+ }
}
if (OPTION_MPTCP_PRIO & opts->suboptions) {
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 6fd4b2c1b076..4cfd80f90003 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -39,29 +39,29 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk,
return 0;
}
-int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id)
+int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
{
u8 rm_addr = READ_ONCE(msk->pm.addr_signal);
- pr_debug("msk=%p, local_id=%d", msk, local_id);
+ pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
if (rm_addr) {
pr_warn("addr_signal error, rm_addr=%d", rm_addr);
return -EINVAL;
}
- msk->pm.rm_id = local_id;
+ msk->pm.rm_list_tx = *rm_list;
rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL);
WRITE_ONCE(msk->pm.addr_signal, rm_addr);
return 0;
}
-int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id)
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
{
- pr_debug("msk=%p, local_id=%d", msk, local_id);
+ pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
spin_lock_bh(&msk->pm.lock);
- mptcp_pm_nl_rm_subflow_received(msk, local_id);
+ mptcp_pm_nl_rm_subflow_received(msk, rm_list);
spin_unlock_bh(&msk->pm.lock);
return 0;
}
@@ -205,17 +205,20 @@ void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk)
mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK);
}
-void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id)
+void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
+ const struct mptcp_rm_list *rm_list)
{
struct mptcp_pm_data *pm = &msk->pm;
+ u8 i;
- pr_debug("msk=%p remote_id=%d", msk, rm_id);
+ pr_debug("msk=%p remote_ids_nr=%d", msk, rm_list->nr);
- mptcp_event_addr_removed(msk, rm_id);
+ for (i = 0; i < rm_list->nr; i++)
+ mptcp_event_addr_removed(msk, rm_list->ids[i]);
spin_lock_bh(&pm->lock);
mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED);
- pm->rm_id = rm_id;
+ pm->rm_list_rx = *rm_list;
spin_unlock_bh(&pm->lock);
}
@@ -258,9 +261,9 @@ out_unlock:
}
bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
- u8 *rm_id)
+ struct mptcp_rm_list *rm_list)
{
- int ret = false;
+ int ret = false, len;
spin_lock_bh(&msk->pm.lock);
@@ -268,10 +271,15 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
if (!mptcp_pm_should_rm_signal(msk))
goto out_unlock;
- if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE)
+ len = mptcp_rm_addr_len(&msk->pm.rm_list_tx);
+ if (len < 0) {
+ WRITE_ONCE(msk->pm.addr_signal, 0);
+ goto out_unlock;
+ }
+ if (remaining < len)
goto out_unlock;
- *rm_id = msk->pm.rm_id;
+ *rm_list = msk->pm.rm_list_tx;
WRITE_ONCE(msk->pm.addr_signal, 0);
ret = true;
@@ -291,7 +299,8 @@ void mptcp_pm_data_init(struct mptcp_sock *msk)
msk->pm.add_addr_accepted = 0;
msk->pm.local_addr_used = 0;
msk->pm.subflows = 0;
- msk->pm.rm_id = 0;
+ msk->pm.rm_list_tx.nr = 0;
+ msk->pm.rm_list_rx.nr = 0;
WRITE_ONCE(msk->pm.work_pending, false);
WRITE_ONCE(msk->pm.addr_signal, 0);
WRITE_ONCE(msk->pm.accept_addr, false);
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 8e8e35fa4002..5857b82c88bf 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -575,36 +575,40 @@ static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow, *tmp;
struct sock *sk = (struct sock *)msk;
+ u8 i;
- pr_debug("address rm_id %d", msk->pm.rm_id);
+ pr_debug("address rm_list_nr %d", msk->pm.rm_list_rx.nr);
msk_owned_by_me(msk);
- if (!msk->pm.rm_id)
+ if (!msk->pm.rm_list_rx.nr)
return;
if (list_empty(&msk->conn_list))
return;
- list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
+ for (i = 0; i < msk->pm.rm_list_rx.nr; i++) {
+ list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
- if (msk->pm.rm_id != subflow->remote_id)
- continue;
+ if (msk->pm.rm_list_rx.ids[i] != subflow->remote_id)
+ continue;
- spin_unlock_bh(&msk->pm.lock);
- mptcp_subflow_shutdown(sk, ssk, how);
- mptcp_close_ssk(sk, ssk, subflow);
- spin_lock_bh(&msk->pm.lock);
+ pr_debug(" -> address rm_list_ids[%d]=%u", i, msk->pm.rm_list_rx.ids[i]);
+ spin_unlock_bh(&msk->pm.lock);
+ mptcp_subflow_shutdown(sk, ssk, how);
+ mptcp_close_ssk(sk, ssk, subflow);
+ spin_lock_bh(&msk->pm.lock);
- msk->pm.add_addr_accepted--;
- msk->pm.subflows--;
- WRITE_ONCE(msk->pm.accept_addr, true);
+ msk->pm.add_addr_accepted--;
+ msk->pm.subflows--;
+ WRITE_ONCE(msk->pm.accept_addr, true);
- __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMADDR);
+ __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMADDR);
- break;
+ break;
+ }
}
}
@@ -641,39 +645,44 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk)
spin_unlock_bh(&msk->pm.lock);
}
-void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id)
+void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
+ const struct mptcp_rm_list *rm_list)
{
struct mptcp_subflow_context *subflow, *tmp;
struct sock *sk = (struct sock *)msk;
+ u8 i;
- pr_debug("subflow rm_id %d", rm_id);
+ pr_debug("subflow rm_list_nr %d", rm_list->nr);
msk_owned_by_me(msk);
- if (!rm_id)
+ if (!rm_list->nr)
return;
if (list_empty(&msk->conn_list))
return;
- list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
+ for (i = 0; i < rm_list->nr; i++) {
+ list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
- if (rm_id != subflow->local_id)
- continue;
+ if (rm_list->ids[i] != subflow->local_id)
+ continue;
- spin_unlock_bh(&msk->pm.lock);
- mptcp_subflow_shutdown(sk, ssk, how);
- mptcp_close_ssk(sk, ssk, subflow);
- spin_lock_bh(&msk->pm.lock);
+ pr_debug(" -> subflow rm_list_ids[%d]=%u", i, rm_list->ids[i]);
+ spin_unlock_bh(&msk->pm.lock);
+ mptcp_subflow_shutdown(sk, ssk, how);
+ mptcp_close_ssk(sk, ssk, subflow);
+ spin_lock_bh(&msk->pm.lock);
- msk->pm.local_addr_used--;
- msk->pm.subflows--;
+ msk->pm.local_addr_used--;
+ msk->pm.subflows--;
- __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW);
+ __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW);
- break;
+ break;
+ }
}
}
@@ -1071,12 +1080,15 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
struct mptcp_addr_info *addr,
bool force)
{
+ struct mptcp_rm_list list = { .nr = 0 };
bool ret;
+ list.ids[list.nr++] = addr->id;
+
ret = remove_anno_list_by_saddr(msk, addr);
if (ret || force) {
spin_lock_bh(&msk->pm.lock);
- mptcp_pm_remove_addr(msk, addr->id);
+ mptcp_pm_remove_addr(msk, &list);
spin_unlock_bh(&msk->pm.lock);
}
return ret;
@@ -1087,9 +1099,12 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
{
struct mptcp_sock *msk;
long s_slot = 0, s_num = 0;
+ struct mptcp_rm_list list = { .nr = 0 };
pr_debug("remove_id=%d", addr->id);
+ list.ids[list.nr++] = addr->id;
+
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
bool remove_subflow;
@@ -1103,7 +1118,7 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr);
mptcp_pm_remove_anno_addr(msk, addr, remove_subflow);
if (remove_subflow)
- mptcp_pm_remove_subflow(msk, addr->id);
+ mptcp_pm_remove_subflow(msk, &list);
release_sock(sk);
next:
@@ -1185,14 +1200,61 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
return ret;
}
-static void __flush_addrs(struct net *net, struct list_head *list)
+static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
+ struct list_head *rm_list)
+{
+ struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 };
+ struct mptcp_pm_addr_entry *entry;
+
+ list_for_each_entry(entry, rm_list, list) {
+ if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) &&
+ alist.nr < MPTCP_RM_IDS_MAX &&
+ slist.nr < MPTCP_RM_IDS_MAX) {
+ alist.ids[alist.nr++] = entry->addr.id;
+ slist.ids[slist.nr++] = entry->addr.id;
+ } else if (remove_anno_list_by_saddr(msk, &entry->addr) &&
+ alist.nr < MPTCP_RM_IDS_MAX) {
+ alist.ids[alist.nr++] = entry->addr.id;
+ }
+ }
+
+ if (alist.nr) {
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_remove_addr(msk, &alist);
+ spin_unlock_bh(&msk->pm.lock);
+ }
+ if (slist.nr)
+ mptcp_pm_remove_subflow(msk, &slist);
+}
+
+static void mptcp_nl_remove_addrs_list(struct net *net,
+ struct list_head *rm_list)
+{
+ long s_slot = 0, s_num = 0;
+ struct mptcp_sock *msk;
+
+ if (list_empty(rm_list))
+ return;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+
+ lock_sock(sk);
+ mptcp_pm_remove_addrs_and_subflows(msk, rm_list);
+ release_sock(sk);
+
+ sock_put(sk);
+ cond_resched();
+ }
+}
+
+static void __flush_addrs(struct list_head *list)
{
while (!list_empty(list)) {
struct mptcp_pm_addr_entry *cur;
cur = list_entry(list->next,
struct mptcp_pm_addr_entry, list);
- mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr);
list_del_rcu(&cur->list);
mptcp_pm_free_addr_entry(cur);
}
@@ -1217,7 +1279,8 @@ static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
pernet->next_id = 1;
bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1);
spin_unlock_bh(&pernet->lock);
- __flush_addrs(sock_net(skb->sk), &free_list);
+ mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list);
+ __flush_addrs(&free_list);
return 0;
}
@@ -1814,7 +1877,7 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list)
/* net is removed from namespace list, can't race with
* other modifiers
*/
- __flush_addrs(net, &pernet->local_addr_list);
+ __flush_addrs(&pernet->local_addr_list);
}
}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index e21a5bc36cf0..1111a99b024f 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -61,7 +61,7 @@
#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22
#define TCPOLEN_MPTCP_PORT_LEN 2
#define TCPOLEN_MPTCP_PORT_ALIGN 2
-#define TCPOLEN_MPTCP_RM_ADDR_BASE 4
+#define TCPOLEN_MPTCP_RM_ADDR_BASE 3
#define TCPOLEN_MPTCP_PRIO 3
#define TCPOLEN_MPTCP_PRIO_ALIGN 4
#define TCPOLEN_MPTCP_FASTCLOSE 12
@@ -142,7 +142,7 @@ struct mptcp_options_received {
mpc_map:1,
__unused:2;
u8 addr_id;
- u8 rm_id;
+ struct mptcp_rm_list rm_list;
union {
struct in_addr addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -207,7 +207,8 @@ struct mptcp_pm_data {
u8 local_addr_used;
u8 subflows;
u8 status;
- u8 rm_id;
+ struct mptcp_rm_list rm_list_tx;
+ struct mptcp_rm_list rm_list_rx;
};
struct mptcp_data_frag {
@@ -647,7 +648,8 @@ void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id);
void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk);
-void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id);
+void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
+ const struct mptcp_rm_list *rm_list);
void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup);
int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
struct mptcp_addr_info *addr,
@@ -661,8 +663,8 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk,
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr,
bool echo, bool port);
-int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id);
-int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id);
+int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
const struct sock *ssk, gfp_t gfp);
@@ -709,16 +711,25 @@ static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port)
return len;
}
+static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list)
+{
+ if (rm_list->nr == 0 || rm_list->nr > MPTCP_RM_IDS_MAX)
+ return -EINVAL;
+
+ return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1;
+}
+
bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
struct mptcp_addr_info *saddr, bool *echo, bool *port);
bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
- u8 *rm_id);
+ struct mptcp_rm_list *rm_list);
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
void __init mptcp_pm_nl_init(void);
void mptcp_pm_nl_data_init(struct mptcp_sock *msk);
void mptcp_pm_nl_work(struct mptcp_sock *msk);
-void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id);
+void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
+ const struct mptcp_rm_list *rm_list);
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk);
unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk);
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index db7479db8512..4f33307fa3cf 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -397,6 +397,7 @@ dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
msg = "not picking up existing connection ";
goto out_invalid;
}
+ break;
case CT_DCCP_REQUEST:
break;
case CT_DCCP_INVALID:
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index c77ba8690ed8..1bce1d2805c4 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -79,11 +79,8 @@ static int flow_offload_fill_route(struct flow_offload *flow,
enum flow_offload_tuple_dir dir)
{
struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
- struct dst_entry *other_dst = route->tuple[!dir].dst;
struct dst_entry *dst = route->tuple[dir].dst;
-
- if (!dst_hold_safe(route->tuple[dir].dst))
- return -1;
+ int i, j = 0;
switch (flow_tuple->l3proto) {
case NFPROTO_IPV4:
@@ -94,12 +91,46 @@ static int flow_offload_fill_route(struct flow_offload *flow,
break;
}
- flow_tuple->iifidx = other_dst->dev->ifindex;
- flow_tuple->dst_cache = dst;
+ flow_tuple->iifidx = route->tuple[dir].in.ifindex;
+ for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
+ flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
+ flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
+ if (route->tuple[dir].in.ingress_vlans & BIT(i))
+ flow_tuple->in_vlan_ingress |= BIT(j);
+ j++;
+ }
+ flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
+
+ switch (route->tuple[dir].xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
+ ETH_ALEN);
+ memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
+ ETH_ALEN);
+ flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
+ flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex;
+ break;
+ case FLOW_OFFLOAD_XMIT_XFRM:
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ if (!dst_hold_safe(route->tuple[dir].dst))
+ return -1;
+
+ flow_tuple->dst_cache = dst;
+ break;
+ }
+ flow_tuple->xmit_type = route->tuple[dir].xmit_type;
return 0;
}
+static void nft_flow_dst_release(struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir)
+{
+ if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+ flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
+ dst_release(flow->tuplehash[dir].tuple.dst_cache);
+}
+
int flow_offload_route_init(struct flow_offload *flow,
const struct nf_flow_route *route)
{
@@ -118,7 +149,7 @@ int flow_offload_route_init(struct flow_offload *flow,
return 0;
err_route_reply:
- dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
return err;
}
@@ -169,8 +200,8 @@ static void flow_offload_fixup_ct(struct nf_conn *ct)
static void flow_offload_route_release(struct flow_offload *flow)
{
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
}
void flow_offload_free(struct flow_offload *flow)
@@ -389,29 +420,20 @@ static void nf_flow_offload_work_gc(struct work_struct *work)
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
-
-static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
+static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
{
struct tcphdr *tcph;
- if (skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
-
- return 0;
}
-static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
+static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
{
struct udphdr *udph;
- if (skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace2(&udph->check, skb, port,
@@ -419,37 +441,28 @@ static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
- u8 protocol, __be16 port, __be16 new_port)
+static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, __be16 port, __be16 new_port)
{
switch (protocol) {
case IPPROTO_TCP:
- if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
+ nf_flow_nat_port_tcp(skb, thoff, port, new_port);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
+ nf_flow_nat_port_udp(skb, thoff, port, new_port);
break;
}
-
- return 0;
}
-int nf_flow_snat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
+void nf_flow_snat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
- if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
@@ -463,24 +476,19 @@ int nf_flow_snat_port(const struct flow_offload *flow,
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
hdr->dest = new_port;
break;
- default:
- return -1;
}
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+ nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);
-int nf_flow_dnat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
+void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
+ unsigned int thoff, u8 protocol,
+ enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
- if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
@@ -494,11 +502,9 @@ int nf_flow_dnat_port(const struct flow_offload *flow,
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
hdr->source = new_port;
break;
- default:
- return -1;
}
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+ nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index a698dbe28ef5..12cb0cc6958c 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -7,6 +7,9 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
@@ -34,28 +37,20 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
return 0;
}
-static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
+static void nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
{
struct tcphdr *tcph;
- if (skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
-
- return 0;
}
-static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
+static void nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
{
struct udphdr *udph;
- if (skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace4(&udph->check, skb, addr,
@@ -63,31 +58,25 @@ static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
- unsigned int thoff, __be32 addr,
- __be32 new_addr)
+static void nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, __be32 addr,
+ __be32 new_addr)
{
switch (iph->protocol) {
case IPPROTO_TCP:
- if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ip_udp(skb, thoff, addr, new_addr);
break;
}
-
- return 0;
}
-static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_snat_ip(const struct flow_offload *flow,
+ struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, enum flow_offload_tuple_dir dir)
{
__be32 addr, new_addr;
@@ -102,17 +91,15 @@ static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
iph->daddr = new_addr;
break;
- default:
- return -1;
}
csum_replace4(&iph->check, addr, new_addr);
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+ nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
}
-static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_dnat_ip(const struct flow_offload *flow,
+ struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, enum flow_offload_tuple_dir dir)
{
__be32 addr, new_addr;
@@ -127,31 +114,24 @@ static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
iph->saddr = new_addr;
break;
- default:
- return -1;
}
csum_replace4(&iph->check, addr, new_addr);
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+ nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
}
-static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- unsigned int thoff, enum flow_offload_tuple_dir dir)
+static void nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ unsigned int thoff, enum flow_offload_tuple_dir dir,
+ struct iphdr *iph)
{
- struct iphdr *iph = ip_hdr(skb);
-
- if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
- (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
- return -1;
-
- iph = ip_hdr(skb);
- if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
- (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
- return -1;
-
- return 0;
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir);
+ nf_flow_snat_ip(flow, skb, iph, thoff, dir);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir);
+ nf_flow_dnat_ip(flow, skb, iph, thoff, dir);
+ }
}
static bool ip_has_options(unsigned int thoff)
@@ -159,29 +139,58 @@ static bool ip_has_options(unsigned int thoff)
return thoff != sizeof(struct iphdr);
}
+static void nf_flow_tuple_encap(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple)
+{
+ struct vlan_ethhdr *veth;
+ struct pppoe_hdr *phdr;
+ int i = 0;
+
+ if (skb_vlan_tag_present(skb)) {
+ tuple->encap[i].id = skb_vlan_tag_get(skb);
+ tuple->encap[i].proto = skb->vlan_proto;
+ i++;
+ }
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ tuple->encap[i].id = ntohs(veth->h_vlan_TCI);
+ tuple->encap[i].proto = skb->protocol;
+ break;
+ case htons(ETH_P_PPP_SES):
+ phdr = (struct pppoe_hdr *)skb_mac_header(skb);
+ tuple->encap[i].id = ntohs(phdr->sid);
+ tuple->encap[i].proto = skb->protocol;
+ break;
+ }
+}
+
static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple)
+ struct flow_offload_tuple *tuple, u32 *hdrsize,
+ u32 offset)
{
- unsigned int thoff, hdrsize;
struct flow_ports *ports;
+ unsigned int thoff;
struct iphdr *iph;
- if (!pskb_may_pull(skb, sizeof(*iph)))
+ if (!pskb_may_pull(skb, sizeof(*iph) + offset))
return -1;
- iph = ip_hdr(skb);
- thoff = iph->ihl * 4;
+ iph = (struct iphdr *)(skb_network_header(skb) + offset);
+ thoff = (iph->ihl * 4);
if (ip_is_fragment(iph) ||
unlikely(ip_has_options(thoff)))
return -1;
+ thoff += offset;
+
switch (iph->protocol) {
case IPPROTO_TCP:
- hdrsize = sizeof(struct tcphdr);
+ *hdrsize = sizeof(struct tcphdr);
break;
case IPPROTO_UDP:
- hdrsize = sizeof(struct udphdr);
+ *hdrsize = sizeof(struct udphdr);
break;
default:
return -1;
@@ -190,11 +199,10 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
if (iph->ttl <= 1)
return -1;
- thoff = iph->ihl * 4;
- if (!pskb_may_pull(skb, thoff + hdrsize))
+ if (!pskb_may_pull(skb, thoff + *hdrsize))
return -1;
- iph = ip_hdr(skb);
+ iph = (struct iphdr *)(skb_network_header(skb) + offset);
ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
tuple->src_v4.s_addr = iph->saddr;
@@ -204,6 +212,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
tuple->l3proto = AF_INET;
tuple->l4proto = iph->protocol;
tuple->iifidx = dev->ifindex;
+ nf_flow_tuple_encap(skb, tuple);
return 0;
}
@@ -220,14 +229,6 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
return true;
}
-static int nf_flow_offload_dst_check(struct dst_entry *dst)
-{
- if (unlikely(dst_xfrm(dst)))
- return dst_check(dst, 0) ? 0 : -1;
-
- return 0;
-}
-
static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
const struct nf_hook_state *state,
struct dst_entry *dst)
@@ -238,6 +239,91 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
return NF_STOLEN;
}
+static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb)
+{
+ __be16 proto;
+
+ proto = *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
+ sizeof(struct pppoe_hdr)));
+ switch (proto) {
+ case htons(PPP_IP):
+ return htons(ETH_P_IP);
+ case htons(PPP_IPV6):
+ return htons(ETH_P_IPV6);
+ }
+
+ return 0;
+}
+
+static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto,
+ u32 *offset)
+{
+ struct vlan_ethhdr *veth;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ if (veth->h_vlan_encapsulated_proto == proto) {
+ *offset += VLAN_HLEN;
+ return true;
+ }
+ break;
+ case htons(ETH_P_PPP_SES):
+ if (nf_flow_pppoe_proto(skb) == proto) {
+ *offset += PPPOE_SES_HLEN;
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+static void nf_flow_encap_pop(struct sk_buff *skb,
+ struct flow_offload_tuple_rhash *tuplehash)
+{
+ struct vlan_hdr *vlan_hdr;
+ int i;
+
+ for (i = 0; i < tuplehash->tuple.encap_num; i++) {
+ if (skb_vlan_tag_present(skb)) {
+ __vlan_hwaccel_clear_tag(skb);
+ continue;
+ }
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ vlan_hdr = (struct vlan_hdr *)skb->data;
+ __skb_pull(skb, VLAN_HLEN);
+ vlan_set_encap_proto(skb, vlan_hdr);
+ skb_reset_network_header(skb);
+ break;
+ case htons(ETH_P_PPP_SES):
+ skb->protocol = nf_flow_pppoe_proto(skb);
+ skb_pull(skb, PPPOE_SES_HLEN);
+ skb_reset_network_header(skb);
+ break;
+ }
+ }
+}
+
+static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
+ const struct flow_offload_tuple_rhash *tuplehash,
+ unsigned short type)
+{
+ struct net_device *outdev;
+
+ outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx);
+ if (!outdev)
+ return NF_DROP;
+
+ skb->dev = outdev;
+ dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest,
+ tuplehash->tuple.out.h_source, skb->len);
+ dev_queue_xmit(skb);
+
+ return NF_STOLEN;
+}
+
unsigned int
nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -248,15 +334,18 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
enum flow_offload_tuple_dir dir;
struct flow_offload *flow;
struct net_device *outdev;
+ u32 hdrsize, offset = 0;
+ unsigned int thoff, mtu;
struct rtable *rt;
- unsigned int thoff;
struct iphdr *iph;
__be32 nexthop;
+ int ret;
- if (skb->protocol != htons(ETH_P_IP))
+ if (skb->protocol != htons(ETH_P_IP) &&
+ !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &offset))
return NF_ACCEPT;
- if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
+ if (nf_flow_tuple_ip(skb, state->in, &tuple, &hdrsize, offset) < 0)
return NF_ACCEPT;
tuplehash = flow_offload_lookup(flow_table, &tuple);
@@ -265,77 +354,87 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
- outdev = rt->dst.dev;
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+ mtu = flow->tuplehash[dir].tuple.mtu + offset;
+ if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
return NF_ACCEPT;
- if (skb_try_make_writable(skb, sizeof(*iph)))
- return NF_DROP;
-
- thoff = ip_hdr(skb)->ihl * 4;
- if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
+ iph = (struct iphdr *)(skb_network_header(skb) + offset);
+ thoff = (iph->ihl * 4) + offset;
+ if (nf_flow_state_check(flow, iph->protocol, skb, thoff))
return NF_ACCEPT;
- flow_offload_refresh(flow_table, flow);
-
- if (nf_flow_offload_dst_check(&rt->dst)) {
- flow_offload_teardown(flow);
- return NF_ACCEPT;
+ if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+ tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) {
+ rt = (struct rtable *)tuplehash->tuple.dst_cache;
+ if (!dst_check(&rt->dst, 0)) {
+ flow_offload_teardown(flow);
+ return NF_ACCEPT;
+ }
}
- if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
+ if (skb_try_make_writable(skb, thoff + hdrsize))
return NF_DROP;
+ flow_offload_refresh(flow_table, flow);
+
+ nf_flow_encap_pop(skb, tuplehash);
+ thoff -= offset;
+
iph = ip_hdr(skb);
+ nf_flow_nat_ip(flow, skb, thoff, dir, iph);
+
ip_decrease_ttl(iph);
skb->tstamp = 0;
if (flow_table->flags & NF_FLOWTABLE_COUNTER)
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
- if (unlikely(dst_xfrm(&rt->dst))) {
+ if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
memset(skb->cb, 0, sizeof(struct inet_skb_parm));
IPCB(skb)->iif = skb->dev->ifindex;
IPCB(skb)->flags = IPSKB_FORWARDED;
return nf_flow_xmit_xfrm(skb, state, &rt->dst);
}
- skb->dev = outdev;
- nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
- skb_dst_set_noref(skb, &rt->dst);
- neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+ switch (tuplehash->tuple.xmit_type) {
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ outdev = rt->dst.dev;
+ skb->dev = outdev;
+ nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+ skb_dst_set_noref(skb, &rt->dst);
+ neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+ ret = NF_STOLEN;
+ break;
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP);
+ if (ret == NF_DROP)
+ flow_offload_teardown(flow);
+ break;
+ }
- return NF_STOLEN;
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
-static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr,
+ struct ipv6hdr *ip6h)
{
struct tcphdr *tcph;
- if (skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
new_addr->s6_addr32, true);
-
- return 0;
}
-static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr)
{
struct udphdr *udph;
- if (skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
@@ -343,32 +442,26 @@ static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff, struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff, struct in6_addr *addr,
+ struct in6_addr *new_addr)
{
switch (ip6h->nexthdr) {
case IPPROTO_TCP:
- if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr, ip6h);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr);
break;
}
-
- return 0;
}
-static int nf_flow_snat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_snat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
{
struct in6_addr addr, new_addr;
@@ -383,17 +476,15 @@ static int nf_flow_snat_ipv6(const struct flow_offload *flow,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
ip6h->daddr = new_addr;
break;
- default:
- return -1;
}
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+ nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
}
-static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_dnat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
{
struct in6_addr addr, new_addr;
@@ -408,52 +499,48 @@ static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
ip6h->saddr = new_addr;
break;
- default:
- return -1;
}
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+ nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
}
-static int nf_flow_nat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_nat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb,
+ enum flow_offload_tuple_dir dir,
+ struct ipv6hdr *ip6h)
{
- struct ipv6hdr *ip6h = ipv6_hdr(skb);
unsigned int thoff = sizeof(*ip6h);
- if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
- (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_snat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
- return -1;
-
- ip6h = ipv6_hdr(skb);
- if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
- (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_dnat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
- return -1;
-
- return 0;
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir);
+ nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir);
+ nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir);
+ }
}
static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple)
+ struct flow_offload_tuple *tuple, u32 *hdrsize,
+ u32 offset)
{
- unsigned int thoff, hdrsize;
struct flow_ports *ports;
struct ipv6hdr *ip6h;
+ unsigned int thoff;
- if (!pskb_may_pull(skb, sizeof(*ip6h)))
+ thoff = sizeof(*ip6h) + offset;
+ if (!pskb_may_pull(skb, thoff))
return -1;
- ip6h = ipv6_hdr(skb);
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
switch (ip6h->nexthdr) {
case IPPROTO_TCP:
- hdrsize = sizeof(struct tcphdr);
+ *hdrsize = sizeof(struct tcphdr);
break;
case IPPROTO_UDP:
- hdrsize = sizeof(struct udphdr);
+ *hdrsize = sizeof(struct udphdr);
break;
default:
return -1;
@@ -462,11 +549,10 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
if (ip6h->hop_limit <= 1)
return -1;
- thoff = sizeof(*ip6h);
- if (!pskb_may_pull(skb, thoff + hdrsize))
+ if (!pskb_may_pull(skb, thoff + *hdrsize))
return -1;
- ip6h = ipv6_hdr(skb);
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
tuple->src_v6 = ip6h->saddr;
@@ -476,6 +562,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
tuple->l3proto = AF_INET6;
tuple->l4proto = ip6h->nexthdr;
tuple->iifidx = dev->ifindex;
+ nf_flow_tuple_encap(skb, tuple);
return 0;
}
@@ -491,13 +578,17 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
const struct in6_addr *nexthop;
struct flow_offload *flow;
struct net_device *outdev;
+ unsigned int thoff, mtu;
+ u32 hdrsize, offset = 0;
struct ipv6hdr *ip6h;
struct rt6_info *rt;
+ int ret;
- if (skb->protocol != htons(ETH_P_IPV6))
+ if (skb->protocol != htons(ETH_P_IPV6) &&
+ !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &offset))
return NF_ACCEPT;
- if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
+ if (nf_flow_tuple_ipv6(skb, state->in, &tuple, &hdrsize, offset) < 0)
return NF_ACCEPT;
tuplehash = flow_offload_lookup(flow_table, &tuple);
@@ -506,48 +597,64 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
- outdev = rt->dst.dev;
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+ mtu = flow->tuplehash[dir].tuple.mtu + offset;
+ if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
return NF_ACCEPT;
- if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb,
- sizeof(*ip6h)))
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
+ thoff = sizeof(*ip6h) + offset;
+ if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff))
return NF_ACCEPT;
- flow_offload_refresh(flow_table, flow);
-
- if (nf_flow_offload_dst_check(&rt->dst)) {
- flow_offload_teardown(flow);
- return NF_ACCEPT;
+ if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+ tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) {
+ rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
+ if (!dst_check(&rt->dst, 0)) {
+ flow_offload_teardown(flow);
+ return NF_ACCEPT;
+ }
}
- if (skb_try_make_writable(skb, sizeof(*ip6h)))
+ if (skb_try_make_writable(skb, thoff + hdrsize))
return NF_DROP;
- if (nf_flow_nat_ipv6(flow, skb, dir) < 0)
- return NF_DROP;
+ flow_offload_refresh(flow_table, flow);
+
+ nf_flow_encap_pop(skb, tuplehash);
ip6h = ipv6_hdr(skb);
+ nf_flow_nat_ipv6(flow, skb, dir, ip6h);
+
ip6h->hop_limit--;
skb->tstamp = 0;
if (flow_table->flags & NF_FLOWTABLE_COUNTER)
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
- if (unlikely(dst_xfrm(&rt->dst))) {
+ if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
IP6CB(skb)->iif = skb->dev->ifindex;
IP6CB(skb)->flags = IP6SKB_FORWARDED;
return nf_flow_xmit_xfrm(skb, state, &rt->dst);
}
- skb->dev = outdev;
- nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
- skb_dst_set_noref(skb, &rt->dst);
- neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+ switch (tuplehash->tuple.xmit_type) {
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ outdev = rt->dst.dev;
+ skb->dev = outdev;
+ nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+ skb_dst_set_noref(skb, &rt->dst);
+ neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+ ret = NF_STOLEN;
+ break;
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6);
+ if (ret == NF_DROP)
+ flow_offload_teardown(flow);
+ break;
+ }
- return NF_STOLEN;
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index 2a6993fa40d7..7d0d128407be 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -13,7 +13,9 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_tuple.h>
-static struct workqueue_struct *nf_flow_offload_wq;
+static struct workqueue_struct *nf_flow_offload_add_wq;
+static struct workqueue_struct *nf_flow_offload_del_wq;
+static struct workqueue_struct *nf_flow_offload_stats_wq;
struct flow_offload_work {
struct list_head list;
@@ -175,28 +177,45 @@ static int flow_offload_eth_src(struct net *net,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
- const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple;
struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
- struct net_device *dev;
+ const struct flow_offload_tuple *other_tuple, *this_tuple;
+ struct net_device *dev = NULL;
+ const unsigned char *addr;
u32 mask, val;
u16 val16;
- dev = dev_get_by_index(net, tuple->iifidx);
- if (!dev)
- return -ENOENT;
+ this_tuple = &flow->tuplehash[dir].tuple;
+
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ addr = this_tuple->out.h_source;
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ dev = dev_get_by_index(net, other_tuple->iifidx);
+ if (!dev)
+ return -ENOENT;
+
+ addr = dev->dev_addr;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
mask = ~0xffff0000;
- memcpy(&val16, dev->dev_addr, 2);
+ memcpy(&val16, addr, 2);
val = val16 << 16;
flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
&val, &mask);
mask = ~0xffffffff;
- memcpy(&val, dev->dev_addr + 2, 4);
+ memcpy(&val, addr + 2, 4);
flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8,
&val, &mask);
- dev_put(dev);
+
+ if (dev)
+ dev_put(dev);
return 0;
}
@@ -208,27 +227,40 @@ static int flow_offload_eth_dst(struct net *net,
{
struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
- const void *daddr = &flow->tuplehash[!dir].tuple.src_v4;
+ const struct flow_offload_tuple *other_tuple, *this_tuple;
const struct dst_entry *dst_cache;
unsigned char ha[ETH_ALEN];
struct neighbour *n;
+ const void *daddr;
u32 mask, val;
u8 nud_state;
u16 val16;
- dst_cache = flow->tuplehash[dir].tuple.dst_cache;
- n = dst_neigh_lookup(dst_cache, daddr);
- if (!n)
- return -ENOENT;
+ this_tuple = &flow->tuplehash[dir].tuple;
- read_lock_bh(&n->lock);
- nud_state = n->nud_state;
- ether_addr_copy(ha, n->ha);
- read_unlock_bh(&n->lock);
-
- if (!(nud_state & NUD_VALID)) {
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ ether_addr_copy(ha, this_tuple->out.h_dest);
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ daddr = &other_tuple->src_v4;
+ dst_cache = this_tuple->dst_cache;
+ n = dst_neigh_lookup(dst_cache, daddr);
+ if (!n)
+ return -ENOENT;
+
+ read_lock_bh(&n->lock);
+ nud_state = n->nud_state;
+ ether_addr_copy(ha, n->ha);
+ read_unlock_bh(&n->lock);
neigh_release(n);
- return -ENOENT;
+
+ if (!(nud_state & NUD_VALID))
+ return -ENOENT;
+ break;
+ default:
+ return -EOPNOTSUPP;
}
mask = ~0xffffffff;
@@ -241,7 +273,6 @@ static int flow_offload_eth_dst(struct net *net,
val = val16;
flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
&val, &mask);
- neigh_release(n);
return 0;
}
@@ -463,27 +494,52 @@ static void flow_offload_ipv4_checksum(struct net *net,
}
}
-static void flow_offload_redirect(const struct flow_offload *flow,
+static void flow_offload_redirect(struct net *net,
+ const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
- struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
- struct rtable *rt;
+ const struct flow_offload_tuple *this_tuple, *other_tuple;
+ struct flow_action_entry *entry;
+ struct net_device *dev;
+ int ifindex;
+
+ this_tuple = &flow->tuplehash[dir].tuple;
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ this_tuple = &flow->tuplehash[dir].tuple;
+ ifindex = this_tuple->out.hw_ifidx;
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ ifindex = other_tuple->iifidx;
+ break;
+ default:
+ return;
+ }
- rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return;
+
+ entry = flow_action_entry_next(flow_rule);
entry->id = FLOW_ACTION_REDIRECT;
- entry->dev = rt->dst.dev;
- dev_hold(rt->dst.dev);
+ entry->dev = dev;
}
static void flow_offload_encap_tunnel(const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
+ const struct flow_offload_tuple *this_tuple;
struct flow_action_entry *entry;
struct dst_entry *dst;
- dst = flow->tuplehash[dir].tuple.dst_cache;
+ this_tuple = &flow->tuplehash[dir].tuple;
+ if (this_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT)
+ return;
+
+ dst = this_tuple->dst_cache;
if (dst && dst->lwtstate) {
struct ip_tunnel_info *tun_info;
@@ -500,10 +556,15 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
+ const struct flow_offload_tuple *other_tuple;
struct flow_action_entry *entry;
struct dst_entry *dst;
- dst = flow->tuplehash[!dir].tuple.dst_cache;
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT)
+ return;
+
+ dst = other_tuple->dst_cache;
if (dst && dst->lwtstate) {
struct ip_tunnel_info *tun_info;
@@ -515,10 +576,14 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow,
}
}
-int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
- enum flow_offload_tuple_dir dir,
- struct nf_flow_rule *flow_rule)
+static int
+nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
{
+ const struct flow_offload_tuple *other_tuple;
+ int i;
+
flow_offload_decap_tunnel(flow, dir, flow_rule);
flow_offload_encap_tunnel(flow, dir, flow_rule);
@@ -526,6 +591,39 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
return -1;
+ other_tuple = &flow->tuplehash[!dir].tuple;
+
+ for (i = 0; i < other_tuple->encap_num; i++) {
+ struct flow_action_entry *entry;
+
+ if (other_tuple->in_vlan_ingress & BIT(i))
+ continue;
+
+ entry = flow_action_entry_next(flow_rule);
+
+ switch (other_tuple->encap[i].proto) {
+ case htons(ETH_P_PPP_SES):
+ entry->id = FLOW_ACTION_PPPOE_PUSH;
+ entry->pppoe.sid = other_tuple->encap[i].id;
+ break;
+ case htons(ETH_P_8021Q):
+ entry->id = FLOW_ACTION_VLAN_PUSH;
+ entry->vlan.vid = other_tuple->encap[i].id;
+ entry->vlan.proto = other_tuple->encap[i].proto;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0)
+ return -1;
+
if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
flow_offload_ipv4_snat(net, flow, dir, flow_rule);
flow_offload_port_snat(net, flow, dir, flow_rule);
@@ -538,7 +636,7 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
test_bit(NF_FLOW_DNAT, &flow->flags))
flow_offload_ipv4_checksum(net, flow, flow_rule);
- flow_offload_redirect(flow, dir, flow_rule);
+ flow_offload_redirect(net, flow, dir, flow_rule);
return 0;
}
@@ -548,11 +646,7 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
- flow_offload_decap_tunnel(flow, dir, flow_rule);
- flow_offload_encap_tunnel(flow, dir, flow_rule);
-
- if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
- flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
+ if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0)
return -1;
if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
@@ -564,7 +658,7 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
flow_offload_port_dnat(net, flow, dir, flow_rule);
}
- flow_offload_redirect(flow, dir, flow_rule);
+ flow_offload_redirect(net, flow, dir, flow_rule);
return 0;
}
@@ -578,10 +672,10 @@ nf_flow_offload_rule_alloc(struct net *net,
enum flow_offload_tuple_dir dir)
{
const struct nf_flowtable *flowtable = offload->flowtable;
+ const struct flow_offload_tuple *tuple, *other_tuple;
const struct flow_offload *flow = offload->flow;
- const struct flow_offload_tuple *tuple;
+ struct dst_entry *other_dst = NULL;
struct nf_flow_rule *flow_rule;
- struct dst_entry *other_dst;
int err = -ENOMEM;
flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL);
@@ -597,7 +691,10 @@ nf_flow_offload_rule_alloc(struct net *net,
flow_rule->rule->match.key = &flow_rule->match.key;
tuple = &flow->tuplehash[dir].tuple;
- other_dst = flow->tuplehash[!dir].tuple.dst_cache;
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
+ other_dst = other_tuple->dst_cache;
+
err = nf_flow_rule_match(&flow_rule->match, tuple, other_dst);
if (err < 0)
goto err_flow_match;
@@ -826,7 +923,12 @@ static void flow_offload_work_handler(struct work_struct *work)
static void flow_offload_queue_work(struct flow_offload_work *offload)
{
- queue_work(nf_flow_offload_wq, &offload->work);
+ if (offload->cmd == FLOW_CLS_REPLACE)
+ queue_work(nf_flow_offload_add_wq, &offload->work);
+ else if (offload->cmd == FLOW_CLS_DESTROY)
+ queue_work(nf_flow_offload_del_wq, &offload->work);
+ else
+ queue_work(nf_flow_offload_stats_wq, &offload->work);
}
static struct flow_offload_work *
@@ -898,8 +1000,11 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable,
void nf_flow_table_offload_flush(struct nf_flowtable *flowtable)
{
- if (nf_flowtable_hw_offload(flowtable))
- flush_workqueue(nf_flow_offload_wq);
+ if (nf_flowtable_hw_offload(flowtable)) {
+ flush_workqueue(nf_flow_offload_add_wq);
+ flush_workqueue(nf_flow_offload_del_wq);
+ flush_workqueue(nf_flow_offload_stats_wq);
+ }
}
static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
@@ -1011,15 +1116,33 @@ EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup);
int nf_flow_table_offload_init(void)
{
- nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload",
- WQ_UNBOUND, 0);
- if (!nf_flow_offload_wq)
+ nf_flow_offload_add_wq = alloc_workqueue("nf_ft_offload_add",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_add_wq)
return -ENOMEM;
+ nf_flow_offload_del_wq = alloc_workqueue("nf_ft_offload_del",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_del_wq)
+ goto err_del_wq;
+
+ nf_flow_offload_stats_wq = alloc_workqueue("nf_ft_offload_stats",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_stats_wq)
+ goto err_stats_wq;
+
return 0;
+
+err_stats_wq:
+ destroy_workqueue(nf_flow_offload_del_wq);
+err_del_wq:
+ destroy_workqueue(nf_flow_offload_add_wq);
+ return -ENOMEM;
}
void nf_flow_table_offload_exit(void)
{
- destroy_workqueue(nf_flow_offload_wq);
+ destroy_workqueue(nf_flow_offload_add_wq);
+ destroy_workqueue(nf_flow_offload_del_wq);
+ destroy_workqueue(nf_flow_offload_stats_wq);
}
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index f57f1a6ba96f..fc2526b8bd55 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -900,6 +900,12 @@ static void nf_tables_table_disable(struct net *net, struct nft_table *table)
nft_table_disable(net, table, 0);
}
+/*
+ * Dormancy transition recorded in a table-update transaction by
+ * nf_tables_updtable().  The commit path disables the table when the state
+ * is DORMANT; the abort path disables it again when the state is WAKEUP.
+ */
+enum {
+	NFT_TABLE_STATE_UNCHANGED	= 0,	/* no dormancy transition recorded */
+	NFT_TABLE_STATE_DORMANT		= 1,	/* active table is being made dormant */
+	NFT_TABLE_STATE_WAKEUP		= 2,	/* dormant table was enabled by this update */
+};
+
static int nf_tables_updtable(struct nft_ctx *ctx)
{
struct nft_trans *trans;
@@ -929,19 +935,17 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
if ((flags & NFT_TABLE_F_DORMANT) &&
!(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
- nft_trans_table_enable(trans) = false;
+ nft_trans_table_state(trans) = NFT_TABLE_STATE_DORMANT;
} else if (!(flags & NFT_TABLE_F_DORMANT) &&
ctx->table->flags & NFT_TABLE_F_DORMANT) {
- ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
ret = nf_tables_table_enable(ctx->net, ctx->table);
if (ret >= 0)
- nft_trans_table_enable(trans) = true;
- else
- ctx->table->flags |= NFT_TABLE_F_DORMANT;
+ nft_trans_table_state(trans) = NFT_TABLE_STATE_WAKEUP;
}
if (ret < 0)
goto err;
+ nft_trans_table_flags(trans) = flags;
nft_trans_table_update(trans) = true;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
@@ -8086,11 +8090,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (!nft_trans_table_enable(trans)) {
- nf_tables_table_disable(net,
- trans->ctx.table);
- trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
- }
+ if (nft_trans_table_state(trans) == NFT_TABLE_STATE_DORMANT)
+ nf_tables_table_disable(net, trans->ctx.table);
+
+ trans->ctx.table->flags = nft_trans_table_flags(trans);
} else {
nft_clear(net, trans->ctx.table);
}
@@ -8303,11 +8306,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (nft_trans_table_enable(trans)) {
- nf_tables_table_disable(net,
- trans->ctx.table);
- trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
- }
+ if (nft_trans_table_state(trans) == NFT_TABLE_STATE_WAKEUP)
+ nf_tables_table_disable(net, trans->ctx.table);
+
nft_trans_destroy(trans);
} else {
list_del_rcu(&trans->ctx.table->list);
@@ -8577,6 +8578,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
data->verdict.chain);
if (err < 0)
return err;
+ break;
default:
break;
}
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 882fe8648653..0592a9456084 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -527,6 +527,7 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
case NFT_CT_ZONE:
if (--nft_ct_pcpu_template_refcnt == 0)
nft_ct_tmpl_put_pcpu();
+ break;
#endif
default:
break;
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 3a6c84fb2c90..4843dd2b410c 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -19,10 +19,205 @@ struct nft_flow_offload {
struct nft_flowtable *flowtable;
};
+/*
+ * Select the transmit type for @dst: XFRM when dst_xfrm() reports a
+ * transform on the route, neighbour-based transmission otherwise.
+ */
+static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
+{
+	return dst_xfrm(dst) ? FLOW_OFFLOAD_XMIT_XFRM : FLOW_OFFLOAD_XMIT_NEIGH;
+}
+
+/*
+ * Fill @route with the defaults derived from a plain route lookup:
+ * @dst_cache becomes the destination of direction @dir (with the transmit
+ * type chosen by nft_xmit_type()), and the ifindex of its device is stored
+ * as the input interface of the opposite direction.
+ */
+static void nft_default_forward_path(struct nf_flow_route *route,
+				     struct dst_entry *dst_cache,
+				     enum ip_conntrack_dir dir)
+{
+	/* Output side of @dir. */
+	route->tuple[dir].xmit_type = nft_xmit_type(dst_cache);
+	route->tuple[dir].dst = dst_cache;
+
+	/* Input side of the opposite direction. */
+	route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex;
+}
+
+/*
+ * Look up the neighbour for direction @dir on @dst_cache, copy its hardware
+ * address into @ha and query the device forwarding path towards it.
+ *
+ * The lookup key is the source address of the opposite conntrack tuple.
+ * The neighbour state and address are read together while holding the
+ * neighbour lock, and the reference is dropped before the path is queried.
+ *
+ * Returns -1 when no neighbour entry exists or it is not in a NUD_VALID
+ * state; otherwise returns whatever dev_fill_forward_path() returns.
+ */
+static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
+				     const struct dst_entry *dst_cache,
+				     const struct nf_conn *ct,
+				     enum ip_conntrack_dir dir, u8 *ha,
+				     struct net_device_path_stack *stack)
+{
+	const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
+	struct net_device *dev = dst_cache->dev;
+	struct neighbour *neigh;
+	bool usable;
+
+	neigh = dst_neigh_lookup(dst_cache, daddr);
+	if (!neigh)
+		return -1;
+
+	read_lock_bh(&neigh->lock);
+	usable = neigh->nud_state & NUD_VALID;
+	ether_addr_copy(ha, neigh->ha);
+	read_unlock_bh(&neigh->lock);
+	neigh_release(neigh);
+
+	return usable ? dev_fill_forward_path(dev, ha, stack) : -1;
+}
+
+/*
+ * Scratch summary of one device path stack, filled by nft_dev_path_info()
+ * and copied into the flow route by nft_dev_forward_path().
+ */
+struct nft_forward_info {
+	/* Device recorded as input interface of the opposite direction. */
+	const struct net_device *indev;
+	/* Device whose ifindex becomes out.ifindex for direct transmit. */
+	const struct net_device *outdev;
+	/* Device whose ifindex becomes out.hw_ifindex for direct transmit. */
+	const struct net_device *hw_outdev;
+	/* Encapsulation tags collected along the path, in walk order. */
+	struct id {
+		__u16 id;
+		__be16 proto;
+	} encap[NF_FLOW_TABLE_ENCAP_MAX];
+	/* Number of valid entries in encap[]. */
+	u8 num_encaps;
+	/* Bitmask over encap[] indexes, set for DEV_PATH_BR_VLAN_UNTAG_HW. */
+	u8 ingress_vlans;
+	/* Ethernet source/destination addresses for direct transmit. */
+	u8 h_source[ETH_ALEN];
+	u8 h_dest[ETH_ALEN];
+	/* Transmit type selected while walking the path. */
+	enum flow_offload_xmit_type xmit_type;
+};
+
+/*
+ * Return true only for a non-NULL, non-loopback ARPHRD_ETHER device whose
+ * address is ETH_ALEN bytes long and passes is_valid_ether_addr().
+ */
+static bool nft_is_valid_ether_device(const struct net_device *dev)
+{
+	if (!dev)
+		return false;
+	if (dev->flags & IFF_LOOPBACK)
+		return false;
+	if (dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN)
+		return false;
+
+	return is_valid_ether_addr(dev->dev_addr);
+}
+
+/*
+ * Walk the device path @stack and summarise it in @info.
+ *
+ * @ha is the next-hop hardware address; it seeds info->h_dest and may be
+ * replaced by the PPPoE peer address.  For every Ethernet, DSA, VLAN or
+ * PPPoE hop the hop's device becomes info->indev; VLAN and PPPoE hops also
+ * record an encapsulation entry and become info->outdev.  A DSA hop ends
+ * the walk.  A bridge hop adjusts the recorded encapsulations according to
+ * its VLAN mode and selects direct transmit.
+ *
+ * info->indev is left NULL when the path cannot be used: an unknown hop
+ * type, more encapsulations than NF_FLOW_TABLE_ENCAP_MAX, or a bridge VLAN
+ * mode that is inconsistent with the encapsulations recorded so far.
+ * Callers must check info->indev before using the rest of @info.
+ */
+static void nft_dev_path_info(const struct net_device_path_stack *stack,
+			      struct nft_forward_info *info,
+			      unsigned char *ha, struct nf_flowtable *flowtable)
+{
+	const struct net_device_path *path;
+	bool bad_stack = false;
+	int i;
+
+	memcpy(info->h_dest, ha, ETH_ALEN);
+
+	for (i = 0; i < stack->num_paths; i++) {
+		path = &stack->path[i];
+		switch (path->type) {
+		case DEV_PATH_ETHERNET:
+		case DEV_PATH_DSA:
+		case DEV_PATH_VLAN:
+		case DEV_PATH_PPPOE:
+			info->indev = path->dev;
+			if (is_zero_ether_addr(info->h_source))
+				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+			if (path->type == DEV_PATH_ETHERNET)
+				break;
+			if (path->type == DEV_PATH_DSA) {
+				/* End the walk by exhausting the loop counter. */
+				i = stack->num_paths;
+				break;
+			}
+
+			/* DEV_PATH_VLAN and DEV_PATH_PPPOE */
+			if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+				info->indev = NULL;
+				break;
+			}
+			info->outdev = path->dev;
+			info->encap[info->num_encaps].id = path->encap.id;
+			info->encap[info->num_encaps].proto = path->encap.proto;
+			info->num_encaps++;
+			if (path->type == DEV_PATH_PPPOE)
+				memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
+			break;
+		case DEV_PATH_BRIDGE:
+			if (is_zero_ether_addr(info->h_source))
+				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+			switch (path->bridge.vlan_mode) {
+			case DEV_PATH_BR_VLAN_UNTAG_HW:
+				/* BIT(-1) would be an undefined shift. */
+				if (!info->num_encaps) {
+					bad_stack = true;
+					break;
+				}
+				info->ingress_vlans |= BIT(info->num_encaps - 1);
+				break;
+			case DEV_PATH_BR_VLAN_TAG:
+				/* Never write past the end of encap[]. */
+				if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+					bad_stack = true;
+					break;
+				}
+				info->encap[info->num_encaps].id = path->bridge.vlan_id;
+				info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
+				info->num_encaps++;
+				break;
+			case DEV_PATH_BR_VLAN_UNTAG:
+				/* num_encaps is a u8: 0 - 1 would wrap to 255. */
+				if (!info->num_encaps) {
+					bad_stack = true;
+					break;
+				}
+				info->num_encaps--;
+				break;
+			case DEV_PATH_BR_VLAN_KEEP:
+				break;
+			}
+			if (bad_stack) {
+				/* Mark the path unusable and stop walking. */
+				info->indev = NULL;
+				i = stack->num_paths;
+				break;
+			}
+			info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+			break;
+		default:
+			info->indev = NULL;
+			break;
+		}
+	}
+	if (!info->outdev)
+		info->outdev = info->indev;
+
+	info->hw_outdev = info->indev;
+
+	if (nf_flowtable_hw_offload(flowtable) &&
+	    nft_is_valid_ether_device(info->indev))
+		info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+}
+
+/* Return true if @dev is the device of one of the hooks on @ft's hook list. */
+static bool nft_flowtable_find_dev(const struct net_device *dev,
+				   struct nft_flowtable *ft)
+{
+	struct nft_hook *hook;
+
+	list_for_each_entry_rcu(hook, &ft->hook_list, list) {
+		if (hook->ops.dev == dev)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Refine the default route of direction @dir with information from the
+ * device forwarding path.
+ *
+ * Nothing is changed when the path cannot be resolved, when it yields no
+ * input device, or when that device is not hooked on flowtable @ft.
+ * Otherwise the input interface and encapsulation of the opposite direction
+ * are updated, and, if the path selected direct transmit, the output
+ * addresses and interfaces of @dir as well.
+ */
+static void nft_dev_forward_path(struct nf_flow_route *route,
+				 const struct nf_conn *ct,
+				 enum ip_conntrack_dir dir,
+				 struct nft_flowtable *ft)
+{
+	const struct dst_entry *dst = route->tuple[dir].dst;
+	struct net_device_path_stack stack;
+	struct nft_forward_info info = {};
+	unsigned char ha[ETH_ALEN];
+	int n;
+
+	if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) < 0)
+		return;
+
+	nft_dev_path_info(&stack, &info, ha, &ft->data);
+	if (!info.indev)
+		return;
+	if (!nft_flowtable_find_dev(info.indev, ft))
+		return;
+
+	/* Ingress description of the opposite direction. */
+	route->tuple[!dir].in.ifindex = info.indev->ifindex;
+	for (n = 0; n < info.num_encaps; n++) {
+		route->tuple[!dir].in.encap[n].id = info.encap[n].id;
+		route->tuple[!dir].in.encap[n].proto = info.encap[n].proto;
+	}
+	route->tuple[!dir].in.num_encaps = info.num_encaps;
+	route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
+
+	if (info.xmit_type != FLOW_OFFLOAD_XMIT_DIRECT)
+		return;
+
+	/* Egress description of @dir for direct transmit. */
+	memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
+	memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
+	route->tuple[dir].out.ifindex = info.outdev->ifindex;
+	route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
+	route->tuple[dir].xmit_type = info.xmit_type;
+}
+
static int nft_flow_route(const struct nft_pktinfo *pkt,
const struct nf_conn *ct,
struct nf_flow_route *route,
- enum ip_conntrack_dir dir)
+ enum ip_conntrack_dir dir,
+ struct nft_flowtable *ft)
{
struct dst_entry *this_dst = skb_dst(pkt->skb);
struct dst_entry *other_dst = NULL;
@@ -44,8 +239,14 @@ static int nft_flow_route(const struct nft_pktinfo *pkt,
if (!other_dst)
return -ENOENT;
- route->tuple[dir].dst = this_dst;
- route->tuple[!dir].dst = other_dst;
+ nft_default_forward_path(route, this_dst, dir);
+ nft_default_forward_path(route, other_dst, !dir);
+
+ if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH &&
+ route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) {
+ nft_dev_forward_path(route, ct, dir, ft);
+ nft_dev_forward_path(route, ct, !dir, ft);
+ }
return 0;
}
@@ -74,8 +275,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
struct nft_flow_offload *priv = nft_expr_priv(expr);
struct nf_flowtable *flowtable = &priv->flowtable->data;
struct tcphdr _tcph, *tcph = NULL;
+ struct nf_flow_route route = {};
enum ip_conntrack_info ctinfo;
- struct nf_flow_route route;
struct flow_offload *flow;
enum ip_conntrack_dir dir;
struct nf_conn *ct;
@@ -112,7 +313,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
goto out;
dir = CTINFO2DIR(ctinfo);
- if (nft_flow_route(pkt, ct, &route, dir) < 0)
+ if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0)
goto err_flow_route;
flow = flow_offload_alloc(ct);
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 4ed7e52c7012..88deb5b41429 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -497,10 +497,12 @@ void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto)
if (unlikely(packet_length(skb, vport->dev) > mtu &&
!skb_is_gso(skb))) {
- net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
- vport->dev->name,
- packet_length(skb, vport->dev), mtu);
vport->dev->stats.tx_errors++;
+ if (vport->dev->flags & IFF_UP)
+ net_warn_ratelimited("%s: dropped over-mtu packet: "
+ "%d > %d\n", vport->dev->name,
+ packet_length(skb, vport->dev),
+ mtu);
goto drop;
}
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 1eb7495ac5b4..8a930ca6d6b1 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -20,7 +20,7 @@
struct vport;
struct vport_parms;
-/* The following definitions are for users of the vport subsytem: */
+/* The following definitions are for users of the vport subsystem: */
int ovs_vport_init(void);
void ovs_vport_exit(void);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e24b2841c643..118d585337d7 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2057,7 +2057,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
* and skb->cb are mangled. It works because (and until) packets
* falling here are owned by current CPU. Output packets are cloned
* by dev_queue_xmit_nit(), input packets are processed by net_bh
- * sequencially, so that if we return skb to original state on exit,
+ * sequentially, so that if we return skb to original state on exit,
* we will not harm anyone.
*/
diff --git a/net/psample/psample.c b/net/psample/psample.c
index 482c07f2766b..118d5d2a81a0 100644
--- a/net/psample/psample.c
+++ b/net/psample/psample.c
@@ -8,6 +8,7 @@
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/module.h>
+#include <linux/timekeeping.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -356,9 +357,12 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info)
#endif
void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
- u32 trunc_size, int in_ifindex, int out_ifindex,
- u32 sample_rate)
+ u32 sample_rate, const struct psample_metadata *md)
{
+ ktime_t tstamp = ktime_get_real();
+ int out_ifindex = md->out_ifindex;
+ int in_ifindex = md->in_ifindex;
+ u32 trunc_size = md->trunc_size;
#ifdef CONFIG_INET
struct ip_tunnel_info *tun_info;
#endif
@@ -370,10 +374,15 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) +
(out_ifindex ? nla_total_size(sizeof(u16)) : 0) +
+ (md->out_tc_valid ? nla_total_size(sizeof(u16)) : 0) +
+ (md->out_tc_occ_valid ? nla_total_size_64bit(sizeof(u64)) : 0) +
+ (md->latency_valid ? nla_total_size_64bit(sizeof(u64)) : 0) +
nla_total_size(sizeof(u32)) + /* sample_rate */
nla_total_size(sizeof(u32)) + /* orig_size */
nla_total_size(sizeof(u32)) + /* group_num */
- nla_total_size(sizeof(u32)); /* seq */
+ nla_total_size(sizeof(u32)) + /* seq */
+ nla_total_size_64bit(sizeof(u64)) + /* timestamp */
+ nla_total_size(sizeof(u16)); /* protocol */
#ifdef CONFIG_INET
tun_info = skb_tunnel_info(skb);
@@ -423,6 +432,36 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
if (unlikely(ret < 0))
goto error;
+ if (md->out_tc_valid) {
+ ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OUT_TC, md->out_tc);
+ if (unlikely(ret < 0))
+ goto error;
+ }
+
+ if (md->out_tc_occ_valid) {
+ ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_OUT_TC_OCC,
+ md->out_tc_occ, PSAMPLE_ATTR_PAD);
+ if (unlikely(ret < 0))
+ goto error;
+ }
+
+ if (md->latency_valid) {
+ ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_LATENCY,
+ md->latency, PSAMPLE_ATTR_PAD);
+ if (unlikely(ret < 0))
+ goto error;
+ }
+
+ ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_TIMESTAMP,
+ ktime_to_ns(tstamp), PSAMPLE_ATTR_PAD);
+ if (unlikely(ret < 0))
+ goto error;
+
+ ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_PROTO,
+ be16_to_cpu(skb->protocol));
+ if (unlikely(ret < 0))
+ goto error;
+
if (data_len) {
int nla_len = nla_total_size(data_len);
struct nlattr *nla;
diff --git a/net/rds/recv.c b/net/rds/recv.c
index aba4afe4dfed..4db109fb6ec2 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -722,8 +722,6 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
if (msg->msg_name) {
if (ipv6_addr_v4mapped(&inc->i_saddr)) {
- sin = (struct sockaddr_in *)msg->msg_name;
-
sin->sin_family = AF_INET;
sin->sin_port = inc->i_hdr.h_sport;
sin->sin_addr.s_addr =
@@ -731,8 +729,6 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
msg->msg_namelen = sizeof(*sin);
} else {
- sin6 = (struct sockaddr_in6 *)msg->msg_name;
-
sin6->sin6_family = AF_INET6;
sin6->sin6_port = inc->i_hdr.h_sport;
sin6->sin6_addr = inc->i_saddr;
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index 6e35703ff353..c0e04c261a15 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -347,6 +347,7 @@ static int rose_del_node(struct rose_route_struct *rose_route,
case 1:
rose_node->neighbour[1] =
rose_node->neighbour[2];
+ break;
case 2:
break;
}
@@ -508,6 +509,7 @@ void rose_rt_device_down(struct net_device *dev)
fallthrough;
case 1:
t->neighbour[1] = t->neighbour[2];
+ break;
case 2:
break;
}
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 8d8452b1cdd4..0fab8de176d2 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -42,6 +42,8 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
[TCA_POLICE_RESULT] = { .type = NLA_U32 },
[TCA_POLICE_RATE64] = { .type = NLA_U64 },
[TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 },
+ [TCA_POLICE_PKTRATE64] = { .type = NLA_U64, .min = 1 },
+ [TCA_POLICE_PKTBURST64] = { .type = NLA_U64, .min = 1 },
};
static int tcf_police_init(struct net *net, struct nlattr *nla,
@@ -61,6 +63,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
bool exists = false;
u32 index;
u64 rate64, prate64;
+ u64 pps, ppsburst;
if (nla == NULL)
return -EINVAL;
@@ -142,6 +145,21 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
}
}
+ if ((tb[TCA_POLICE_PKTRATE64] && !tb[TCA_POLICE_PKTBURST64]) ||
+ (!tb[TCA_POLICE_PKTRATE64] && tb[TCA_POLICE_PKTBURST64])) {
+ NL_SET_ERR_MSG(extack,
+ "Both or neither packet-per-second burst and rate must be provided");
+ err = -EINVAL;
+ goto failure;
+ }
+
+ if (tb[TCA_POLICE_PKTRATE64] && R_tab) {
+ NL_SET_ERR_MSG(extack,
+ "packet-per-second and byte-per-second rate limits not allowed in same action");
+ err = -EINVAL;
+ goto failure;
+ }
+
new = kzalloc(sizeof(*new), GFP_KERNEL);
if (unlikely(!new)) {
err = -ENOMEM;
@@ -183,6 +201,14 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
if (tb[TCA_POLICE_AVRATE])
new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
+ if (tb[TCA_POLICE_PKTRATE64]) {
+ pps = nla_get_u64(tb[TCA_POLICE_PKTRATE64]);
+ ppsburst = nla_get_u64(tb[TCA_POLICE_PKTBURST64]);
+ new->pps_present = true;
+ new->tcfp_pkt_burst = PSCHED_TICKS2NS(ppsburst);
+ psched_ppscfg_precompute(&new->ppsrate, pps);
+ }
+
spin_lock_bh(&police->tcf_lock);
spin_lock_bh(&police->tcfp_lock);
police->tcfp_t_c = ktime_get_ns();
@@ -217,8 +243,8 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_police *police = to_police(a);
+ s64 now, toks, ppstoks = 0, ptoks = 0;
struct tcf_police_params *p;
- s64 now, toks, ptoks = 0;
int ret;
tcf_lastuse_update(&police->tcf_tm);
@@ -236,7 +262,7 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
}
if (qdisc_pkt_len(skb) <= p->tcfp_mtu) {
- if (!p->rate_present) {
+ if (!p->rate_present && !p->pps_present) {
ret = p->tcfp_result;
goto end;
}
@@ -251,14 +277,23 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
ptoks -= (s64)psched_l2t_ns(&p->peak,
qdisc_pkt_len(skb));
}
- toks += police->tcfp_toks;
- if (toks > p->tcfp_burst)
- toks = p->tcfp_burst;
- toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
- if ((toks|ptoks) >= 0) {
+ if (p->rate_present) {
+ toks += police->tcfp_toks;
+ if (toks > p->tcfp_burst)
+ toks = p->tcfp_burst;
+ toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
+ } else if (p->pps_present) {
+ ppstoks = min_t(s64, now - police->tcfp_t_c, p->tcfp_pkt_burst);
+ ppstoks += police->tcfp_pkttoks;
+ if (ppstoks > p->tcfp_pkt_burst)
+ ppstoks = p->tcfp_pkt_burst;
+ ppstoks -= (s64)psched_pkt2t_ns(&p->ppsrate, 1);
+ }
+ if ((toks | ptoks | ppstoks) >= 0) {
police->tcfp_t_c = now;
police->tcfp_toks = toks;
police->tcfp_ptoks = ptoks;
+ police->tcfp_pkttoks = ppstoks;
spin_unlock_bh(&police->tcfp_lock);
ret = p->tcfp_result;
goto inc_drops;
@@ -331,6 +366,16 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
TCA_POLICE_PAD))
goto nla_put_failure;
}
+ if (p->pps_present) {
+ if (nla_put_u64_64bit(skb, TCA_POLICE_PKTRATE64,
+ police->params->ppsrate.rate_pkts_ps,
+ TCA_POLICE_PAD))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(skb, TCA_POLICE_PKTBURST64,
+ PSCHED_NS2TICKS(p->tcfp_pkt_burst),
+ TCA_POLICE_PAD))
+ goto nla_put_failure;
+ }
if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
goto nla_put_failure;
if (p->tcfp_result &&
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 3ebf9ede3cf1..6a0c16e4351d 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -158,10 +158,8 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
{
struct tcf_sample *s = to_sample(a);
struct psample_group *psample_group;
+ struct psample_metadata md = {};
int retval;
- int size;
- int iif;
- int oif;
tcf_lastuse_update(&s->tcf_tm);
bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
@@ -172,20 +170,18 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
/* randomly sample packets according to rate */
if (psample_group && (prandom_u32() % s->rate == 0)) {
if (!skb_at_tc_ingress(skb)) {
- iif = skb->skb_iif;
- oif = skb->dev->ifindex;
+ md.in_ifindex = skb->skb_iif;
+ md.out_ifindex = skb->dev->ifindex;
} else {
- iif = skb->dev->ifindex;
- oif = 0;
+ md.in_ifindex = skb->dev->ifindex;
}
/* on ingress, the mac header gets popped, so push it back */
if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
skb_push(skb, skb->mac_len);
- size = s->truncate ? s->trunc_size : skb->len;
- psample_sample_packet(psample_group, skb, size, iif, oif,
- s->rate);
+ md.trunc_size = s->truncate ? s->trunc_size : skb->len;
+ psample_sample_packet(psample_group, skb, s->rate, &md);
if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
skb_pull(skb, skb->mac_len);
@@ -194,6 +190,16 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
return retval;
}
+/*
+ * Fold externally reported counters into the sample action @a and advance
+ * its last-use timestamp; the timestamp only ever moves forward.
+ */
+static void tcf_sample_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+				    u64 drops, u64 lastuse, bool hw)
+{
+	struct tcf_t *stamp = &to_sample(a)->tcf_tm;
+
+	tcf_action_update_stats(a, bytes, packets, drops, hw);
+	stamp->lastuse = max_t(u64, stamp->lastuse, lastuse);
+}
+
static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
@@ -280,6 +286,7 @@ static struct tc_action_ops act_sample_ops = {
.id = TCA_ID_SAMPLE,
.owner = THIS_MODULE,
.act = tcf_sample_act,
+ .stats_update = tcf_sample_stats_update,
.dump = tcf_sample_dump,
.init = tcf_sample_init,
.cleanup = tcf_sample_cleanup,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 13341e7fb077..d3db70865d66 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3662,6 +3662,9 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->police.burst = tcf_police_burst(act);
entry->police.rate_bytes_ps =
tcf_police_rate_bytes_ps(act);
+ entry->police.burst_pkt = tcf_police_burst_pkt(act);
+ entry->police.rate_pkt_ps =
+ tcf_police_rate_pkt_ps(act);
entry->police.mtu = tcf_police_tcfp_mtu(act);
entry->police.index = act->tcfa_index;
} else if (is_tcf_ct(act)) {
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index c69a4ba9c33f..d7869a984881 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -209,16 +209,16 @@ static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter,
struct fl_flow_key *key,
struct fl_flow_key *mkey)
{
- __be16 min_mask, max_mask, min_val, max_val;
+ u16 min_mask, max_mask, min_val, max_val;
- min_mask = htons(filter->mask->key.tp_range.tp_min.dst);
- max_mask = htons(filter->mask->key.tp_range.tp_max.dst);
- min_val = htons(filter->key.tp_range.tp_min.dst);
- max_val = htons(filter->key.tp_range.tp_max.dst);
+ min_mask = ntohs(filter->mask->key.tp_range.tp_min.dst);
+ max_mask = ntohs(filter->mask->key.tp_range.tp_max.dst);
+ min_val = ntohs(filter->key.tp_range.tp_min.dst);
+ max_val = ntohs(filter->key.tp_range.tp_max.dst);
if (min_mask && max_mask) {
- if (htons(key->tp_range.tp.dst) < min_val ||
- htons(key->tp_range.tp.dst) > max_val)
+ if (ntohs(key->tp_range.tp.dst) < min_val ||
+ ntohs(key->tp_range.tp.dst) > max_val)
return false;
/* skb does not have min and max values */
@@ -232,16 +232,16 @@ static bool fl_range_port_src_cmp(struct cls_fl_filter *filter,
struct fl_flow_key *key,
struct fl_flow_key *mkey)
{
- __be16 min_mask, max_mask, min_val, max_val;
+ u16 min_mask, max_mask, min_val, max_val;
- min_mask = htons(filter->mask->key.tp_range.tp_min.src);
- max_mask = htons(filter->mask->key.tp_range.tp_max.src);
- min_val = htons(filter->key.tp_range.tp_min.src);
- max_val = htons(filter->key.tp_range.tp_max.src);
+ min_mask = ntohs(filter->mask->key.tp_range.tp_min.src);
+ max_mask = ntohs(filter->mask->key.tp_range.tp_max.src);
+ min_val = ntohs(filter->key.tp_range.tp_min.src);
+ max_val = ntohs(filter->key.tp_range.tp_max.src);
if (min_mask && max_mask) {
- if (htons(key->tp_range.tp.src) < min_val ||
- htons(key->tp_range.tp.src) > max_val)
+ if (ntohs(key->tp_range.tp.src) < min_val ||
+ ntohs(key->tp_range.tp.src) > max_val)
return false;
/* skb does not have min and max values */
@@ -783,16 +783,16 @@ static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src));
if (mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst &&
- htons(key->tp_range.tp_max.dst) <=
- htons(key->tp_range.tp_min.dst)) {
+ ntohs(key->tp_range.tp_max.dst) <=
+ ntohs(key->tp_range.tp_min.dst)) {
NL_SET_ERR_MSG_ATTR(extack,
tb[TCA_FLOWER_KEY_PORT_DST_MIN],
"Invalid destination port range (min must be strictly smaller than max)");
return -EINVAL;
}
if (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src &&
- htons(key->tp_range.tp_max.src) <=
- htons(key->tp_range.tp_min.src)) {
+ ntohs(key->tp_range.tp_max.src) <=
+ ntohs(key->tp_range.tp_min.src)) {
NL_SET_ERR_MSG_ATTR(extack,
tb[TCA_FLOWER_KEY_PORT_SRC_MIN],
"Invalid source port range (min must be strictly smaller than max)");
@@ -1044,8 +1044,8 @@ static int fl_set_key_flags(struct nlattr **tb, u32 *flags_key,
return -EINVAL;
}
- key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
- mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
+ key = be32_to_cpu(nla_get_be32(tb[TCA_FLOWER_KEY_FLAGS]));
+ mask = be32_to_cpu(nla_get_be32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
*flags_key = 0;
*flags_mask = 0;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 320b3d31fa97..b79a7e27bb31 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -263,7 +263,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
/*
* Step 3+n. If classifier selected a link sharing class,
* apply agency specific classifier.
- * Repeat this procdure until we hit a leaf node.
+ * Repeat this procedure until we hit a leaf node.
*/
head = cl;
}
@@ -859,7 +859,7 @@ cbq_dequeue(struct Qdisc *sch)
return NULL;
}
-/* CBQ class maintanance routines */
+/* CBQ class maintenance routines */
static void cbq_adjust_levels(struct cbq_class *this)
{
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 49eae93d1489..44991ea726fc 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1325,6 +1325,48 @@ void dev_shutdown(struct net_device *dev)
WARN_ON(timer_pending(&dev->watchdog_timer));
}
+/**
+ * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division
+ * @rate: Rate to compute reciprocal division values of
+ * @mult: Multiplier for reciprocal division
+ * @shift: Shift for reciprocal division
+ *
+ * Stores in @mult and @shift the multiplier and shift that implement a
+ * reciprocal division by @rate.  A zero @rate yields mult = 1, shift = 0.
+ *
+ * The point is to replace a divide in the fast path by a reciprocal one
+ * (a multiply and a shift).
+ *
+ * Normal formula would be :
+ *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
+ *
+ * We compute mult/shift to use instead :
+ *  time_in_ns = (len * mult) >> shift;
+ *
+ * We try to get the highest possible mult value for accuracy,
+ * but have to make sure no overflows will ever happen.
+ *
+ * reciprocal_value() is not used here as it doesn't handle 64-bit values.
+ */
+static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift)
+{
+	u64 factor = NSEC_PER_SEC;
+	u32 m = 1;
+	u8 s = 0;
+
+	if (rate) {
+		while (1) {
+			m = div64_u64(factor, rate);
+			/* Stop once either mult or factor has its top bit set. */
+			if ((m & (1U << 31)) || (factor & (1ULL << 63)))
+				break;
+			factor <<= 1;
+			s++;
+		}
+	}
+
+	*mult = m;
+	*shift = s;
+}
+
void psched_ratecfg_precompute(struct psched_ratecfg *r,
const struct tc_ratespec *conf,
u64 rate64)
@@ -1333,34 +1375,17 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r,
r->overhead = conf->overhead;
r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
- r->mult = 1;
- /*
- * The deal here is to replace a divide by a reciprocal one
- * in fast path (a reciprocal divide is a multiply and a shift)
- *
- * Normal formula would be :
- * time_in_ns = (NSEC_PER_SEC * len) / rate_bps
- *
- * We compute mult/shift to use instead :
- * time_in_ns = (len * mult) >> shift;
- *
- * We try to get the highest possible mult value for accuracy,
- * but have to make sure no overflows will ever happen.
- */
- if (r->rate_bytes_ps > 0) {
- u64 factor = NSEC_PER_SEC;
-
- for (;;) {
- r->mult = div64_u64(factor, r->rate_bytes_ps);
- if (r->mult & (1U << 31) || factor & (1ULL << 63))
- break;
- factor <<= 1;
- r->shift++;
- }
- }
+ psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift);
}
EXPORT_SYMBOL(psched_ratecfg_precompute);
+/*
+ * Record the packets-per-second rate @pktrate64 in @r and pre-compute the
+ * reciprocal-division multiplier and shift for it.
+ */
+void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64)
+{
+	r->rate_pkts_ps = pktrate64;
+	psched_ratecfg_precompute__(pktrate64, &r->mult, &r->shift);
+}
+EXPORT_SYMBOL(psched_ppscfg_precompute);
+
static void mini_qdisc_rcu_func(struct rcu_head *head)
{
}
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 8287894541e3..922ed6b91abb 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -411,18 +411,10 @@ done:
return txtime;
}
-static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
- struct sk_buff **to_free)
+static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
+ struct Qdisc *child, struct sk_buff **to_free)
{
struct taprio_sched *q = qdisc_priv(sch);
- struct Qdisc *child;
- int queue;
-
- queue = skb_get_queue_mapping(skb);
-
- child = q->qdiscs[queue];
- if (unlikely(!child))
- return qdisc_drop(skb, sch, to_free);
if (skb->sk && sock_flag(skb->sk, SOCK_TXTIME)) {
if (!is_valid_interval(skb, sch))
@@ -439,6 +431,58 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return qdisc_enqueue(skb, child, to_free);
}
+static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct Qdisc *child;
+ int queue;
+
+ queue = skb_get_queue_mapping(skb);
+
+ child = q->qdiscs[queue];
+ if (unlikely(!child))
+ return qdisc_drop(skb, sch, to_free);
+
+ /* Large packets might not be transmitted when the transmission duration
+ * exceeds any configured interval. Therefore, segment the skb into
+ * smaller chunks. Skip it for the full offload case, as the driver
+ * and/or the hardware is expected to handle this.
+ */
+ if (skb_is_gso(skb) && !FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+ unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb);
+ netdev_features_t features = netif_skb_features(skb);
+ struct sk_buff *segs, *nskb;
+ int ret;
+
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs))
+ return qdisc_drop(skb, sch, to_free);
+
+ skb_list_walk_safe(segs, segs, nskb) {
+ skb_mark_not_on_list(segs);
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ slen += segs->len;
+
+ ret = taprio_enqueue_one(segs, sch, child, to_free);
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ } else {
+ numsegs++;
+ }
+ }
+
+ if (numsegs > 1)
+ qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
+ consume_skb(skb);
+
+ return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+ }
+
+ return taprio_enqueue_one(skb, sch, child, to_free);
+}
+
static struct sk_buff *taprio_peek_soft(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index abe29d1aa23a..fd0796269eed 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -3,6 +3,7 @@
*
* Copyright (c) 2000-2006, 2018, Ericsson AB
* Copyright (c) 2004-2005, 2010-2011, Wind River Systems
+ * Copyright (c) 2020-2021, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
index 1a11831bef62..0772cfadaa0d 100644
--- a/net/tipc/addr.h
+++ b/net/tipc/addr.h
@@ -3,7 +3,7 @@
*
* Copyright (c) 2000-2006, 2018, Ericsson AB
* Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2020, Red Hat Inc
+ * Copyright (c) 2020-2021, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -44,6 +44,50 @@
#include <net/netns/generic.h>
#include "core.h"
+/* Struct tipc_uaddr: internal version of struct sockaddr_tipc.
+ * Must be kept aligned both regarding field positions and size.
+ */
+struct tipc_uaddr {
+ unsigned short family;
+ unsigned char addrtype;
+ signed char scope;
+ union {
+ struct {
+ struct tipc_service_addr sa;
+ u32 lookup_node;
+ };
+ struct tipc_service_range sr;
+ struct tipc_socket_addr sk;
+ };
+};
+
+static inline void tipc_uaddr(struct tipc_uaddr *ua, u32 atype, u32 scope,
+ u32 type, u32 lower, u32 upper)
+{
+ ua->family = AF_TIPC;
+ ua->addrtype = atype;
+ ua->scope = scope;
+ ua->sr.type = type;
+ ua->sr.lower = lower;
+ ua->sr.upper = upper;
+}
+
+static inline bool tipc_uaddr_valid(struct tipc_uaddr *ua, int len)
+{
+ u32 atype;
+
+ if (len < sizeof(struct sockaddr_tipc))
+ return false;
+ atype = ua->addrtype;
+ if (ua->family != AF_TIPC)
+ return false;
+ if (atype == TIPC_SERVICE_ADDR || atype == TIPC_SOCKET_ADDR)
+ return true;
+ if (atype == TIPC_SERVICE_RANGE)
+ return ua->sr.upper >= ua->sr.lower;
+ return false;
+}
+
static inline u32 tipc_own_addr(struct net *net)
{
return tipc_net(net)->node_addr;
diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index f4fca8f7f63f..6f64acef73dc 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -317,7 +317,7 @@ static int tipc_aead_key_generate(struct tipc_aead_key *skey);
#define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \
do { \
- typeof(rcu_ptr) __tmp = rcu_dereference_protected((rcu_ptr), \
+ struct tipc_aead *__tmp = rcu_dereference_protected((rcu_ptr), \
lockdep_is_held(lock)); \
rcu_assign_pointer((rcu_ptr), (ptr)); \
tipc_aead_put(__tmp); \
@@ -798,7 +798,7 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb,
ehdr = (struct tipc_ehdr *)skb->data;
salt = aead->salt;
if (aead->mode == CLUSTER_KEY)
- salt ^= ehdr->addr; /* __be32 */
+ salt ^= __be32_to_cpu(ehdr->addr);
else if (__dnode)
salt ^= tipc_node_get_addr(__dnode);
memcpy(iv, &salt, 4);
@@ -929,7 +929,7 @@ static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead,
ehdr = (struct tipc_ehdr *)skb->data;
salt = aead->salt;
if (aead->mode == CLUSTER_KEY)
- salt ^= ehdr->addr; /* __be32 */
+ salt ^= __be32_to_cpu(ehdr->addr);
else if (ehdr->destined)
salt ^= tipc_own_addr(net);
memcpy(iv, &salt, 4);
@@ -1946,16 +1946,16 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
goto rcv;
}
tipc_aead_put(aead);
- aead = tipc_aead_get(tmp);
+ aead = tipc_aead_get((struct tipc_aead __force __rcu *)tmp);
}
if (unlikely(err)) {
- tipc_aead_users_dec(aead, INT_MIN);
+ tipc_aead_users_dec((struct tipc_aead __force __rcu *)aead, INT_MIN);
goto free_skb;
}
/* Set the RX key's user */
- tipc_aead_users_set(aead, 1);
+ tipc_aead_users_set((struct tipc_aead __force __rcu *)aead, 1);
/* Mark this point, RX works */
rx->timer1 = jiffies;
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index 48fac3b17e40..407619697292 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -104,6 +104,36 @@ static struct tipc_monitor *tipc_monitor(struct net *net, int bearer_id)
const int tipc_max_domain_size = sizeof(struct tipc_mon_domain);
+static inline u16 mon_cpu_to_le16(u16 val)
+{
+ return (__force __u16)htons(val);
+}
+
+static inline u32 mon_cpu_to_le32(u32 val)
+{
+ return (__force __u32)htonl(val);
+}
+
+static inline u64 mon_cpu_to_le64(u64 val)
+{
+ return (__force __u64)cpu_to_be64(val);
+}
+
+static inline u16 mon_le16_to_cpu(u16 val)
+{
+ return ntohs((__force __be16)val);
+}
+
+static inline u32 mon_le32_to_cpu(u32 val)
+{
+ return ntohl((__force __be32)val);
+}
+
+static inline u64 mon_le64_to_cpu(u64 val)
+{
+ return be64_to_cpu((__force __be64)val);
+}
+
/* dom_rec_len(): actual length of domain record for transport
*/
static int dom_rec_len(struct tipc_mon_domain *dom, u16 mcnt)
@@ -260,16 +290,16 @@ static void mon_update_local_domain(struct tipc_monitor *mon)
diff |= dom->members[i] != peer->addr;
dom->members[i] = peer->addr;
map_set(&dom->up_map, i, peer->is_up);
- cache->members[i] = htonl(peer->addr);
+ cache->members[i] = mon_cpu_to_le32(peer->addr);
}
diff |= dom->up_map != prev_up_map;
if (!diff)
return;
dom->gen = ++mon->dom_gen;
- cache->len = htons(dom->len);
- cache->gen = htons(dom->gen);
- cache->member_cnt = htons(member_cnt);
- cache->up_map = cpu_to_be64(dom->up_map);
+ cache->len = mon_cpu_to_le16(dom->len);
+ cache->gen = mon_cpu_to_le16(dom->gen);
+ cache->member_cnt = mon_cpu_to_le16(member_cnt);
+ cache->up_map = mon_cpu_to_le64(dom->up_map);
mon_apply_domain(mon, self);
}
@@ -455,10 +485,11 @@ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr,
struct tipc_mon_domain dom_bef;
struct tipc_mon_domain *dom;
struct tipc_peer *peer;
- u16 new_member_cnt = ntohs(arrv_dom->member_cnt);
+ u16 new_member_cnt = mon_le16_to_cpu(arrv_dom->member_cnt);
int new_dlen = dom_rec_len(arrv_dom, new_member_cnt);
- u16 new_gen = ntohs(arrv_dom->gen);
- u16 acked_gen = ntohs(arrv_dom->ack_gen);
+ u16 new_gen = mon_le16_to_cpu(arrv_dom->gen);
+ u16 acked_gen = mon_le16_to_cpu(arrv_dom->ack_gen);
+ u16 arrv_dlen = mon_le16_to_cpu(arrv_dom->len);
bool probing = state->probing;
int i, applied_bef;
@@ -469,7 +500,7 @@ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr,
return;
if (dlen != dom_rec_len(arrv_dom, new_member_cnt))
return;
- if ((dlen < new_dlen) || ntohs(arrv_dom->len) != new_dlen)
+ if (dlen < new_dlen || arrv_dlen != new_dlen)
return;
/* Synch generation numbers with peer if link just came up */
@@ -517,9 +548,9 @@ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr,
dom->len = new_dlen;
dom->gen = new_gen;
dom->member_cnt = new_member_cnt;
- dom->up_map = be64_to_cpu(arrv_dom->up_map);
+ dom->up_map = mon_le64_to_cpu(arrv_dom->up_map);
for (i = 0; i < new_member_cnt; i++)
- dom->members[i] = ntohl(arrv_dom->members[i]);
+ dom->members[i] = mon_le32_to_cpu(arrv_dom->members[i]);
/* Update peers affected by this domain record */
applied_bef = peer->applied;
@@ -548,19 +579,19 @@ void tipc_mon_prep(struct net *net, void *data, int *dlen,
if (likely(state->acked_gen == gen)) {
len = dom_rec_len(dom, 0);
*dlen = len;
- dom->len = htons(len);
- dom->gen = htons(gen);
- dom->ack_gen = htons(state->peer_gen);
+ dom->len = mon_cpu_to_le16(len);
+ dom->gen = mon_cpu_to_le16(gen);
+ dom->ack_gen = mon_cpu_to_le16(state->peer_gen);
dom->member_cnt = 0;
return;
}
/* Send the full record */
read_lock_bh(&mon->lock);
- len = ntohs(mon->cache.len);
+ len = mon_le16_to_cpu(mon->cache.len);
*dlen = len;
memcpy(data, &mon->cache, len);
read_unlock_bh(&mon->lock);
- dom->ack_gen = htons(state->peer_gen);
+ dom->ack_gen = mon_cpu_to_le16(state->peer_gen);
}
void tipc_mon_get_state(struct net *net, u32 addr,
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index e9263280a2d4..3f0a25345a7c 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -707,8 +707,11 @@ bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy)
bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
{
struct tipc_msg *msg = buf_msg(skb);
- u32 dport, dnode;
- u32 onode = tipc_own_addr(net);
+ u32 scope = msg_lookup_scope(msg);
+ u32 self = tipc_own_addr(net);
+ u32 inst = msg_nameinst(msg);
+ struct tipc_socket_addr sk;
+ struct tipc_uaddr ua;
if (!msg_isdata(msg))
return false;
@@ -722,16 +725,16 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
msg = buf_msg(skb);
if (msg_reroute_cnt(msg))
return false;
- dnode = tipc_scope2node(net, msg_lookup_scope(msg));
- dport = tipc_nametbl_translate(net, msg_nametype(msg),
- msg_nameinst(msg), &dnode);
- if (!dport)
+ tipc_uaddr(&ua, TIPC_SERVICE_RANGE, scope,
+ msg_nametype(msg), inst, inst);
+ sk.node = tipc_scope2node(net, scope);
+ if (!tipc_nametbl_lookup_anycast(net, &ua, &sk))
return false;
msg_incr_reroute_cnt(msg);
- if (dnode != onode)
- msg_set_prevnode(msg, onode);
- msg_set_destnode(msg, dnode);
- msg_set_destport(msg, dport);
+ if (sk.node != self)
+ msg_set_prevnode(msg, self);
+ msg_set_destnode(msg, sk.node);
+ msg_set_destport(msg, sk.ref);
*err = TIPC_OK;
return true;
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 6cf57c3bfa27..bda902caa814 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -1,8 +1,9 @@
/*
* net/tipc/name_distr.c: TIPC name distribution code
*
- * Copyright (c) 2000-2006, 2014, Ericsson AB
+ * Copyright (c) 2000-2006, 2014-2019, Ericsson AB
* Copyright (c) 2005, 2010-2011, Wind River Systems
+ * Copyright (c) 2020-2021, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -55,10 +56,10 @@ struct distr_queue_item {
*/
static void publ_to_item(struct distr_item *i, struct publication *p)
{
- i->type = htonl(p->type);
- i->lower = htonl(p->lower);
- i->upper = htonl(p->upper);
- i->port = htonl(p->port);
+ i->type = htonl(p->sr.type);
+ i->lower = htonl(p->sr.lower);
+ i->upper = htonl(p->sr.upper);
+ i->port = htonl(p->sk.ref);
i->key = htonl(p->key);
}
@@ -90,20 +91,20 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size,
/**
* tipc_named_publish - tell other nodes about a new publication by this node
* @net: the associated network namespace
- * @publ: the new publication
+ * @p: the new publication
*/
-struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ)
+struct sk_buff *tipc_named_publish(struct net *net, struct publication *p)
{
struct name_table *nt = tipc_name_table(net);
struct distr_item *item;
struct sk_buff *skb;
- if (publ->scope == TIPC_NODE_SCOPE) {
- list_add_tail_rcu(&publ->binding_node, &nt->node_scope);
+ if (p->scope == TIPC_NODE_SCOPE) {
+ list_add_tail_rcu(&p->binding_node, &nt->node_scope);
return NULL;
}
write_lock_bh(&nt->cluster_scope_lock);
- list_add_tail(&publ->binding_node, &nt->cluster_scope);
+ list_add_tail(&p->binding_node, &nt->cluster_scope);
write_unlock_bh(&nt->cluster_scope_lock);
skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0);
if (!skb) {
@@ -113,25 +114,25 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ)
msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++);
msg_set_non_legacy(buf_msg(skb));
item = (struct distr_item *)msg_data(buf_msg(skb));
- publ_to_item(item, publ);
+ publ_to_item(item, p);
return skb;
}
/**
* tipc_named_withdraw - tell other nodes about a withdrawn publication by this node
* @net: the associated network namespace
- * @publ: the withdrawn publication
+ * @p: the withdrawn publication
*/
-struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ)
+struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *p)
{
struct name_table *nt = tipc_name_table(net);
struct distr_item *item;
struct sk_buff *skb;
write_lock_bh(&nt->cluster_scope_lock);
- list_del(&publ->binding_node);
+ list_del(&p->binding_node);
write_unlock_bh(&nt->cluster_scope_lock);
- if (publ->scope == TIPC_NODE_SCOPE)
+ if (p->scope == TIPC_NODE_SCOPE)
return NULL;
skb = named_prepare_buf(net, WITHDRAWAL, ITEM_SIZE, 0);
@@ -142,7 +143,7 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ)
msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++);
msg_set_non_legacy(buf_msg(skb));
item = (struct distr_item *)msg_data(buf_msg(skb));
- publ_to_item(item, publ);
+ publ_to_item(item, p);
return skb;
}
@@ -233,33 +234,27 @@ void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities)
/**
* tipc_publ_purge - remove publication associated with a failed node
* @net: the associated network namespace
- * @publ: the publication to remove
+ * @p: the publication to remove
* @addr: failed node's address
*
* Invoked for each publication issued by a newly failed node.
* Removes publication structure from name table & deletes it.
*/
-static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr)
+static void tipc_publ_purge(struct net *net, struct publication *p, u32 addr)
{
struct tipc_net *tn = tipc_net(net);
- struct publication *p;
+ struct publication *_p;
+ struct tipc_uaddr ua;
+ tipc_uaddr(&ua, TIPC_SERVICE_RANGE, p->scope, p->sr.type,
+ p->sr.lower, p->sr.upper);
spin_lock_bh(&tn->nametbl_lock);
- p = tipc_nametbl_remove_publ(net, publ->type, publ->lower, publ->upper,
- publ->node, publ->key);
- if (p)
- tipc_node_unsubscribe(net, &p->binding_node, addr);
+ _p = tipc_nametbl_remove_publ(net, &ua, &p->sk, p->key);
+ if (_p)
+ tipc_node_unsubscribe(net, &_p->binding_node, addr);
spin_unlock_bh(&tn->nametbl_lock);
-
- if (p != publ) {
- pr_err("Unable to remove publication from failed node\n"
- " (type=%u, lower=%u, node=0x%x, port=%u, key=%u)\n",
- publ->type, publ->lower, publ->node, publ->port,
- publ->key);
- }
-
- if (p)
- kfree_rcu(p, rcu);
+ if (_p)
+ kfree_rcu(_p, rcu);
}
void tipc_publ_notify(struct net *net, struct list_head *nsub_list,
@@ -293,30 +288,30 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i,
u32 node, u32 dtype)
{
struct publication *p = NULL;
- u32 lower = ntohl(i->lower);
- u32 upper = ntohl(i->upper);
- u32 type = ntohl(i->type);
- u32 port = ntohl(i->port);
+ struct tipc_socket_addr sk;
+ struct tipc_uaddr ua;
u32 key = ntohl(i->key);
+ tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_CLUSTER_SCOPE,
+ ntohl(i->type), ntohl(i->lower), ntohl(i->upper));
+ sk.ref = ntohl(i->port);
+ sk.node = node;
+
if (dtype == PUBLICATION) {
- p = tipc_nametbl_insert_publ(net, type, lower, upper,
- TIPC_CLUSTER_SCOPE, node,
- port, key);
+ p = tipc_nametbl_insert_publ(net, &ua, &sk, key);
if (p) {
tipc_node_subscribe(net, &p->binding_node, node);
return true;
}
} else if (dtype == WITHDRAWAL) {
- p = tipc_nametbl_remove_publ(net, type, lower,
- upper, node, key);
+ p = tipc_nametbl_remove_publ(net, &ua, &sk, key);
if (p) {
tipc_node_unsubscribe(net, &p->binding_node, node);
kfree_rcu(p, rcu);
return true;
}
- pr_warn_ratelimited("Failed to remove binding %u,%u from %x\n",
- type, lower, node);
+ pr_warn_ratelimited("Failed to remove binding %u,%u from %u\n",
+ ua.sr.type, ua.sr.lower, node);
} else {
pr_warn("Unrecognized name table message received\n");
}
@@ -410,15 +405,15 @@ void tipc_named_reinit(struct net *net)
{
struct name_table *nt = tipc_name_table(net);
struct tipc_net *tn = tipc_net(net);
- struct publication *publ;
+ struct publication *p;
u32 self = tipc_own_addr(net);
spin_lock_bh(&tn->nametbl_lock);
- list_for_each_entry_rcu(publ, &nt->node_scope, binding_node)
- publ->node = self;
- list_for_each_entry_rcu(publ, &nt->cluster_scope, binding_node)
- publ->node = self;
+ list_for_each_entry_rcu(p, &nt->node_scope, binding_node)
+ p->sk.node = self;
+ list_for_each_entry_rcu(p, &nt->cluster_scope, binding_node)
+ p->sk.node = self;
nt->rc_dests = 0;
spin_unlock_bh(&tn->nametbl_lock);
}
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index ee5ac40ea2b6..6db9f9e7c0ac 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -3,7 +3,7 @@
*
* Copyright (c) 2000-2006, 2014-2018, Ericsson AB
* Copyright (c) 2004-2008, 2010-2014, Wind River Systems
- * Copyright (c) 2020, Red Hat Inc
+ * Copyright (c) 2020-2021, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -222,59 +222,57 @@ static int hash(int x)
/**
* tipc_publ_create - create a publication structure
- * @type: name sequence type
- * @lower: name sequence lower bound
- * @upper: name sequence upper bound
- * @scope: publication scope
- * @node: network address of publishing socket
- * @port: publishing port
+ * @ua: the service range the user is binding to
+ * @sk: the address of the socket that is bound
* @key: publication key
*/
-static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper,
- u32 scope, u32 node, u32 port,
+static struct publication *tipc_publ_create(struct tipc_uaddr *ua,
+ struct tipc_socket_addr *sk,
u32 key)
{
- struct publication *publ = kzalloc(sizeof(*publ), GFP_ATOMIC);
+ struct publication *p = kzalloc(sizeof(*p), GFP_ATOMIC);
- if (!publ)
+ if (!p)
return NULL;
- publ->type = type;
- publ->lower = lower;
- publ->upper = upper;
- publ->scope = scope;
- publ->node = node;
- publ->port = port;
- publ->key = key;
- INIT_LIST_HEAD(&publ->binding_sock);
- INIT_LIST_HEAD(&publ->binding_node);
- INIT_LIST_HEAD(&publ->local_publ);
- INIT_LIST_HEAD(&publ->all_publ);
- INIT_LIST_HEAD(&publ->list);
- return publ;
+ p->sr = ua->sr;
+ p->sk = *sk;
+ p->scope = ua->scope;
+ p->key = key;
+ INIT_LIST_HEAD(&p->binding_sock);
+ INIT_LIST_HEAD(&p->binding_node);
+ INIT_LIST_HEAD(&p->local_publ);
+ INIT_LIST_HEAD(&p->all_publ);
+ INIT_LIST_HEAD(&p->list);
+ return p;
}
/**
* tipc_service_create - create a service structure for the specified 'type'
- * @type: service type
- * @hd: name_table services list
+ * @net: network namespace
+ * @ua: address representing the service to be bound
*
* Allocates a single range structure and sets it to all 0's.
*/
-static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd)
+static struct tipc_service *tipc_service_create(struct net *net,
+ struct tipc_uaddr *ua)
{
- struct tipc_service *service = kzalloc(sizeof(*service), GFP_ATOMIC);
+ struct name_table *nt = tipc_name_table(net);
+ struct tipc_service *service;
+ struct hlist_head *hd;
+ service = kzalloc(sizeof(*service), GFP_ATOMIC);
if (!service) {
pr_warn("Service creation failed, no memory\n");
return NULL;
}
spin_lock_init(&service->lock);
- service->type = type;
+ service->type = ua->sr.type;
service->ranges = RB_ROOT;
INIT_HLIST_NODE(&service->service_list);
INIT_LIST_HEAD(&service->subscriptions);
+ hd = &nt->services[hash(ua->sr.type)];
hlist_add_head_rcu(&service->service_list, hd);
return service;
}
@@ -282,13 +280,13 @@ static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd)
/* tipc_service_find_range - find service range matching publication parameters
*/
static struct service_range *tipc_service_find_range(struct tipc_service *sc,
- u32 lower, u32 upper)
+ struct tipc_uaddr *ua)
{
struct service_range *sr;
- service_range_foreach_match(sr, sc, lower, upper) {
+ service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) {
/* Look for exact match */
- if (sr->lower == lower && sr->upper == upper)
+ if (sr->lower == ua->sr.lower && sr->upper == ua->sr.upper)
return sr;
}
@@ -296,10 +294,12 @@ static struct service_range *tipc_service_find_range(struct tipc_service *sc,
}
static struct service_range *tipc_service_create_range(struct tipc_service *sc,
- u32 lower, u32 upper)
+ struct publication *p)
{
struct rb_node **n, *parent = NULL;
struct service_range *sr;
+ u32 lower = p->sr.lower;
+ u32 upper = p->sr.upper;
n = &sc->ranges.rb_node;
while (*n) {
@@ -327,64 +327,68 @@ static struct service_range *tipc_service_create_range(struct tipc_service *sc,
return sr;
}
-static struct publication *tipc_service_insert_publ(struct net *net,
- struct tipc_service *sc,
- u32 type, u32 lower,
- u32 upper, u32 scope,
- u32 node, u32 port,
- u32 key)
+static bool tipc_service_insert_publ(struct net *net,
+ struct tipc_service *sc,
+ struct publication *p)
{
struct tipc_subscription *sub, *tmp;
struct service_range *sr;
- struct publication *p;
+ struct publication *_p;
+ u32 node = p->sk.node;
bool first = false;
+ bool res = false;
+ u32 key = p->key;
- sr = tipc_service_create_range(sc, lower, upper);
+ spin_lock_bh(&sc->lock);
+ sr = tipc_service_create_range(sc, p);
if (!sr)
- goto err;
+ goto exit;
first = list_empty(&sr->all_publ);
/* Return if the publication already exists */
- list_for_each_entry(p, &sr->all_publ, all_publ) {
- if (p->key == key && (!p->node || p->node == node))
- return NULL;
+ list_for_each_entry(_p, &sr->all_publ, all_publ) {
+ if (_p->key == key && (!_p->sk.node || _p->sk.node == node)) {
+ pr_debug("Failed to bind duplicate %u,%u,%u/%u:%u/%u\n",
+ p->sr.type, p->sr.lower, p->sr.upper,
+ node, p->sk.ref, key);
+ goto exit;
+ }
}
- /* Create and insert publication */
- p = tipc_publ_create(type, lower, upper, scope, node, port, key);
- if (!p)
- goto err;
- /* Suppose there shouldn't be a huge gap btw publs i.e. >INT_MAX */
- p->id = sc->publ_cnt++;
- if (in_own_node(net, node))
+ if (in_own_node(net, p->sk.node))
list_add(&p->local_publ, &sr->local_publ);
list_add(&p->all_publ, &sr->all_publ);
+ p->id = sc->publ_cnt++;
/* Any subscriptions waiting for notification? */
list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
- tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_PUBLISHED,
- p->port, p->node, p->scope, first);
+ tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, first);
}
- return p;
-err:
- pr_warn("Failed to bind to %u,%u,%u, no memory\n", type, lower, upper);
- return NULL;
+ res = true;
+exit:
+ if (!res)
+ pr_warn("Failed to bind to %u,%u,%u\n",
+ p->sr.type, p->sr.lower, p->sr.upper);
+ spin_unlock_bh(&sc->lock);
+ return res;
}
/**
* tipc_service_remove_publ - remove a publication from a service
- * @sr: service_range to remove publication from
- * @node: target node
+ * @r: service_range to remove publication from
+ * @sk: address of the publishing socket
* @key: target publication key
*/
-static struct publication *tipc_service_remove_publ(struct service_range *sr,
- u32 node, u32 key)
+static struct publication *tipc_service_remove_publ(struct service_range *r,
+ struct tipc_socket_addr *sk,
+ u32 key)
{
struct publication *p;
+ u32 node = sk->node;
- list_for_each_entry(p, &sr->all_publ, all_publ) {
- if (p->key != key || (node && node != p->node))
+ list_for_each_entry(p, &r->all_publ, all_publ) {
+ if (p->key != key || (node && node != p->sk.node))
continue;
list_del(&p->all_publ);
list_del(&p->local_publ);
@@ -417,17 +421,14 @@ static int tipc_publ_sort(void *priv, struct list_head *a,
static void tipc_service_subscribe(struct tipc_service *service,
struct tipc_subscription *sub)
{
- struct tipc_subscr *sb = &sub->evt.s;
struct publication *p, *first, *tmp;
struct list_head publ_list;
struct service_range *sr;
- struct tipc_service_range r;
- u32 filter;
+ u32 filter, lower, upper;
- r.type = tipc_sub_read(sb, seq.type);
- r.lower = tipc_sub_read(sb, seq.lower);
- r.upper = tipc_sub_read(sb, seq.upper);
- filter = tipc_sub_read(sb, filter);
+ filter = sub->s.filter;
+ lower = sub->s.seq.lower;
+ upper = sub->s.seq.upper;
tipc_sub_get(sub);
list_add(&sub->service_list, &service->subscriptions);
@@ -436,7 +437,7 @@ static void tipc_service_subscribe(struct tipc_service *service,
return;
INIT_LIST_HEAD(&publ_list);
- service_range_foreach_match(sr, service, r.lower, r.upper) {
+ service_range_foreach_match(sr, service, lower, upper) {
first = NULL;
list_for_each_entry(p, &sr->all_publ, all_publ) {
if (filter & TIPC_SUB_PORTS)
@@ -452,80 +453,74 @@ static void tipc_service_subscribe(struct tipc_service *service,
/* Sort the publications before reporting */
list_sort(NULL, &publ_list, tipc_publ_sort);
list_for_each_entry_safe(p, tmp, &publ_list, list) {
- tipc_sub_report_overlap(sub, p->lower, p->upper,
- TIPC_PUBLISHED, p->port, p->node,
- p->scope, true);
+ tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, true);
list_del_init(&p->list);
}
}
-static struct tipc_service *tipc_service_find(struct net *net, u32 type)
+static struct tipc_service *tipc_service_find(struct net *net,
+ struct tipc_uaddr *ua)
{
struct name_table *nt = tipc_name_table(net);
struct hlist_head *service_head;
struct tipc_service *service;
- service_head = &nt->services[hash(type)];
+ service_head = &nt->services[hash(ua->sr.type)];
hlist_for_each_entry_rcu(service, service_head, service_list) {
- if (service->type == type)
+ if (service->type == ua->sr.type)
return service;
}
return NULL;
};
-struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
- u32 lower, u32 upper,
- u32 scope, u32 node,
- u32 port, u32 key)
+struct publication *tipc_nametbl_insert_publ(struct net *net,
+ struct tipc_uaddr *ua,
+ struct tipc_socket_addr *sk,
+ u32 key)
{
- struct name_table *nt = tipc_name_table(net);
struct tipc_service *sc;
struct publication *p;
- if (scope > TIPC_NODE_SCOPE || lower > upper) {
- pr_debug("Failed to bind illegal {%u,%u,%u} with scope %u\n",
- type, lower, upper, scope);
- return NULL;
- }
- sc = tipc_service_find(net, type);
- if (!sc)
- sc = tipc_service_create(type, &nt->services[hash(type)]);
- if (!sc)
+ p = tipc_publ_create(ua, sk, key);
+ if (!p)
return NULL;
- spin_lock_bh(&sc->lock);
- p = tipc_service_insert_publ(net, sc, type, lower, upper,
- scope, node, port, key);
- spin_unlock_bh(&sc->lock);
- return p;
+ sc = tipc_service_find(net, ua);
+ if (!sc)
+ sc = tipc_service_create(net, ua);
+ if (sc && tipc_service_insert_publ(net, sc, p))
+ return p;
+ kfree(p);
+ return NULL;
}
-struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
- u32 lower, u32 upper,
- u32 node, u32 key)
+struct publication *tipc_nametbl_remove_publ(struct net *net,
+ struct tipc_uaddr *ua,
+ struct tipc_socket_addr *sk,
+ u32 key)
{
- struct tipc_service *sc = tipc_service_find(net, type);
struct tipc_subscription *sub, *tmp;
- struct service_range *sr = NULL;
struct publication *p = NULL;
+ struct service_range *sr;
+ struct tipc_service *sc;
bool last;
+ sc = tipc_service_find(net, ua);
if (!sc)
- return NULL;
+ goto exit;
spin_lock_bh(&sc->lock);
- sr = tipc_service_find_range(sc, lower, upper);
+ sr = tipc_service_find_range(sc, ua);
if (!sr)
- goto exit;
- p = tipc_service_remove_publ(sr, node, key);
+ goto unlock;
+ p = tipc_service_remove_publ(sr, sk, key);
if (!p)
- goto exit;
+ goto unlock;
/* Notify any waiting subscriptions */
last = list_empty(&sr->all_publ);
list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
- tipc_sub_report_overlap(sub, lower, upper, TIPC_WITHDRAWN,
- p->port, node, p->scope, last);
+ tipc_sub_report_overlap(sub, p, TIPC_WITHDRAWN, last);
}
/* Remove service range item if this was its last publication */
@@ -534,77 +529,83 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
kfree(sr);
}
- /* Delete service item if this no more publications and subscriptions */
+ /* Delete service item if no more publications and subscriptions */
if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) {
hlist_del_init_rcu(&sc->service_list);
kfree_rcu(sc, rcu);
}
-exit:
+unlock:
spin_unlock_bh(&sc->lock);
+exit:
+ if (!p) {
+ pr_err("Failed to remove unknown binding: %u,%u,%u/%u:%u/%u\n",
+ ua->sr.type, ua->sr.lower, ua->sr.upper,
+ sk->node, sk->ref, key);
+ }
return p;
}
/**
- * tipc_nametbl_translate - perform service instance to socket translation
+ * tipc_nametbl_lookup_anycast - perform service instance to socket translation
* @net: network namespace
- * @type: message type
- * @instance: message instance
- * @dnode: the search domain used during translation
+ * @ua: service address to look up
+ * @sk: address of the socket we want to find
*
+ * On entry, a non-zero 'sk->node' indicates the node where we want lookup to be
+ * performed, which may not be this one.
* On exit:
- * - if translation is deferred to another node, leave 'dnode' unchanged and
- * return 0
- * - if translation is attempted and succeeds, set 'dnode' to the publishing
- * node and return the published (non-zero) port number
- * - if translation is attempted and fails, set 'dnode' to 0 and return 0
+ * - If lookup is deferred to another node, leave 'sk->node' unchanged and
+ * return 'true'.
+ * - If lookup is successful, set the 'sk->node' and 'sk->ref' (== portid) which
+ * represent the bound socket and return 'true'.
+ * - If lookup fails, return 'false'
*
* Note that for legacy users (node configured with Z.C.N address format) the
+ * 'closest-first' lookup algorithm must be maintained, i.e., if sk->node is 0
+ * 'closest-first' lookup algorithm must be maintained, i.e., if sk.node is 0
* we must look in the local binding list first
*/
-u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode)
+bool tipc_nametbl_lookup_anycast(struct net *net,
+ struct tipc_uaddr *ua,
+ struct tipc_socket_addr *sk)
{
struct tipc_net *tn = tipc_net(net);
bool legacy = tn->legacy_addr_format;
u32 self = tipc_own_addr(net);
- struct service_range *sr;
+ u32 inst = ua->sa.instance;
+ struct service_range *r;
struct tipc_service *sc;
- struct list_head *list;
struct publication *p;
- u32 port = 0;
- u32 node = 0;
+ struct list_head *l;
+ bool res = false;
- if (!tipc_in_scope(legacy, *dnode, self))
- return 0;
+ if (!tipc_in_scope(legacy, sk->node, self))
+ return true;
rcu_read_lock();
- sc = tipc_service_find(net, type);
+ sc = tipc_service_find(net, ua);
if (unlikely(!sc))
goto exit;
spin_lock_bh(&sc->lock);
- service_range_foreach_match(sr, sc, instance, instance) {
+ service_range_foreach_match(r, sc, inst, inst) {
/* Select lookup algo: local, closest-first or round-robin */
- if (*dnode == self) {
- list = &sr->local_publ;
- if (list_empty(list))
+ if (sk->node == self) {
+ l = &r->local_publ;
+ if (list_empty(l))
continue;
- p = list_first_entry(list, struct publication,
- local_publ);
- list_move_tail(&p->local_publ, &sr->local_publ);
- } else if (legacy && !*dnode && !list_empty(&sr->local_publ)) {
- list = &sr->local_publ;
- p = list_first_entry(list, struct publication,
- local_publ);
- list_move_tail(&p->local_publ, &sr->local_publ);
+ p = list_first_entry(l, struct publication, local_publ);
+ list_move_tail(&p->local_publ, &r->local_publ);
+ } else if (legacy && !sk->node && !list_empty(&r->local_publ)) {
+ l = &r->local_publ;
+ p = list_first_entry(l, struct publication, local_publ);
+ list_move_tail(&p->local_publ, &r->local_publ);
} else {
- list = &sr->all_publ;
- p = list_first_entry(list, struct publication,
- all_publ);
- list_move_tail(&p->all_publ, &sr->all_publ);
+ l = &r->all_publ;
+ p = list_first_entry(l, struct publication, all_publ);
+ list_move_tail(&p->all_publ, &r->all_publ);
}
- port = p->port;
- node = p->node;
+ *sk = p->sk;
+ res = true;
/* Todo: as for legacy, pick the first matching range only, a
* "true" round-robin will be performed as needed.
*/
@@ -614,40 +615,45 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode)
exit:
rcu_read_unlock();
- *dnode = node;
- return port;
+ return res;
}
-bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope,
- struct list_head *dsts, int *dstcnt, u32 exclude,
- bool all)
+/* tipc_nametbl_lookup_group(): look up destination(s) in a communication group
+ * Returns a list of one (== group anycast) or more (== group multicast)
+ * destination socket/node pairs matching the given address.
+ * The requester may or may not want to exclude himself from the list.
+ */
+bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua,
+ struct list_head *dsts, int *dstcnt,
+ u32 exclude, bool mcast)
{
u32 self = tipc_own_addr(net);
+ u32 inst = ua->sa.instance;
struct service_range *sr;
struct tipc_service *sc;
struct publication *p;
*dstcnt = 0;
rcu_read_lock();
- sc = tipc_service_find(net, type);
+ sc = tipc_service_find(net, ua);
if (unlikely(!sc))
goto exit;
spin_lock_bh(&sc->lock);
/* Todo: a full search i.e. service_range_foreach_match() instead? */
- sr = service_range_match_first(sc->ranges.rb_node, instance, instance);
+ sr = service_range_match_first(sc->ranges.rb_node, inst, inst);
if (!sr)
goto no_match;
list_for_each_entry(p, &sr->all_publ, all_publ) {
- if (p->scope != scope)
+ if (p->scope != ua->scope)
continue;
- if (p->port == exclude && p->node == self)
+ if (p->sk.ref == exclude && p->sk.node == self)
continue;
- tipc_dest_push(dsts, p->node, p->port);
+ tipc_dest_push(dsts, p->sk.node, p->sk.ref);
(*dstcnt)++;
- if (all)
+ if (mcast)
continue;
list_move_tail(&p->all_publ, &sr->all_publ);
break;
@@ -659,23 +665,29 @@ exit:
return !list_empty(dsts);
}
-void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
- u32 scope, bool exact, struct list_head *dports)
+/* tipc_nametbl_lookup_mcast_sockets(): look up node local destination sockets
+ * matching the given address
+ * Used on nodes which have received a multicast/broadcast message
+ * Returns a list of local sockets
+ */
+void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua,
+ bool exact, struct list_head *dports)
{
struct service_range *sr;
struct tipc_service *sc;
struct publication *p;
+ u32 scope = ua->scope;
rcu_read_lock();
- sc = tipc_service_find(net, type);
+ sc = tipc_service_find(net, ua);
if (!sc)
goto exit;
spin_lock_bh(&sc->lock);
- service_range_foreach_match(sr, sc, lower, upper) {
+ service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) {
list_for_each_entry(p, &sr->local_publ, local_publ) {
if (p->scope == scope || (!exact && p->scope < scope))
- tipc_dest_push(dports, 0, p->port);
+ tipc_dest_push(dports, 0, p->sk.ref);
}
}
spin_unlock_bh(&sc->lock);
@@ -683,26 +695,27 @@ exit:
rcu_read_unlock();
}
-/* tipc_nametbl_lookup_dst_nodes - find broadcast destination nodes
- * - Creates list of nodes that overlap the given multicast address
- * - Determines if any node local destinations overlap
+/* tipc_nametbl_lookup_mcast_nodes(): look up all destination nodes matching
+ * the given address.
+ * Used on nodes which are sending out a multicast/broadcast message
+ * Returns a list of nodes, including own node if applicable
*/
-void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
- u32 upper, struct tipc_nlist *nodes)
+void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua,
+ struct tipc_nlist *nodes)
{
struct service_range *sr;
struct tipc_service *sc;
struct publication *p;
rcu_read_lock();
- sc = tipc_service_find(net, type);
+ sc = tipc_service_find(net, ua);
if (!sc)
goto exit;
spin_lock_bh(&sc->lock);
- service_range_foreach_match(sr, sc, lower, upper) {
+ service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) {
list_for_each_entry(p, &sr->all_publ, all_publ) {
- tipc_nlist_add(nodes, p->node);
+ tipc_nlist_add(nodes, p->sk.node);
}
}
spin_unlock_bh(&sc->lock);
@@ -713,7 +726,7 @@ exit:
/* tipc_nametbl_build_group - build list of communication group members
*/
void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
- u32 type, u32 scope)
+ struct tipc_uaddr *ua)
{
struct service_range *sr;
struct tipc_service *sc;
@@ -721,7 +734,7 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
struct rb_node *n;
rcu_read_lock();
- sc = tipc_service_find(net, type);
+ sc = tipc_service_find(net, ua);
if (!sc)
goto exit;
@@ -729,9 +742,10 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
for (n = rb_first(&sc->ranges); n; n = rb_next(n)) {
sr = container_of(n, struct service_range, tree_node);
list_for_each_entry(p, &sr->all_publ, all_publ) {
- if (p->scope != scope)
+ if (p->scope != ua->scope)
continue;
- tipc_group_add_member(grp, p->node, p->port, p->lower);
+ tipc_group_add_member(grp, p->sk.node, p->sk.ref,
+ p->sr.lower);
}
}
spin_unlock_bh(&sc->lock);
@@ -741,9 +755,8 @@ exit:
/* tipc_nametbl_publish - add service binding to name table
*/
-struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
- u32 upper, u32 scope, u32 port,
- u32 key)
+struct publication *tipc_nametbl_publish(struct net *net, struct tipc_uaddr *ua,
+ struct tipc_socket_addr *sk, u32 key)
{
struct name_table *nt = tipc_name_table(net);
struct tipc_net *tn = tipc_net(net);
@@ -758,8 +771,7 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
goto exit;
}
- p = tipc_nametbl_insert_publ(net, type, lower, upper, scope,
- tipc_own_addr(net), port, key);
+ p = tipc_nametbl_insert_publ(net, ua, sk, key);
if (p) {
nt->local_publ_count++;
skb = tipc_named_publish(net, p);
@@ -777,41 +789,33 @@ exit:
/**
* tipc_nametbl_withdraw - withdraw a service binding
* @net: network namespace
- * @type: service type
- * @lower: service range lower bound
- * @upper: service range upper bound
+ * @ua: service address/range being unbound
+ * @sk: address of the socket being unbound from
* @key: target publication key
*/
-int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower,
- u32 upper, u32 key)
+void tipc_nametbl_withdraw(struct net *net, struct tipc_uaddr *ua,
+ struct tipc_socket_addr *sk, u32 key)
{
struct name_table *nt = tipc_name_table(net);
struct tipc_net *tn = tipc_net(net);
- u32 self = tipc_own_addr(net);
struct sk_buff *skb = NULL;
struct publication *p;
u32 rc_dests;
spin_lock_bh(&tn->nametbl_lock);
- p = tipc_nametbl_remove_publ(net, type, lower, upper, self, key);
+ p = tipc_nametbl_remove_publ(net, ua, sk, key);
if (p) {
nt->local_publ_count--;
skb = tipc_named_withdraw(net, p);
list_del_init(&p->binding_sock);
kfree_rcu(p, rcu);
- } else {
- pr_err("Failed to remove local publication {%u,%u,%u}/%u\n",
- type, lower, upper, key);
}
rc_dests = nt->rc_dests;
spin_unlock_bh(&tn->nametbl_lock);
- if (skb) {
+ if (skb)
tipc_node_broadcast(net, skb, rc_dests);
- return 1;
- }
- return 0;
}
/**
@@ -820,25 +824,25 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower,
*/
bool tipc_nametbl_subscribe(struct tipc_subscription *sub)
{
- struct name_table *nt = tipc_name_table(sub->net);
struct tipc_net *tn = tipc_net(sub->net);
- struct tipc_subscr *s = &sub->evt.s;
- u32 type = tipc_sub_read(s, seq.type);
+ u32 type = sub->s.seq.type;
struct tipc_service *sc;
+ struct tipc_uaddr ua;
bool res = true;
+ tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, type,
+ sub->s.seq.lower, sub->s.seq.upper);
spin_lock_bh(&tn->nametbl_lock);
- sc = tipc_service_find(sub->net, type);
+ sc = tipc_service_find(sub->net, &ua);
if (!sc)
- sc = tipc_service_create(type, &nt->services[hash(type)]);
+ sc = tipc_service_create(sub->net, &ua);
if (sc) {
spin_lock_bh(&sc->lock);
tipc_service_subscribe(sc, sub);
spin_unlock_bh(&sc->lock);
} else {
- pr_warn("Failed to subscribe for {%u,%u,%u}\n", type,
- tipc_sub_read(s, seq.lower),
- tipc_sub_read(s, seq.upper));
+ pr_warn("Failed to subscribe for {%u,%u,%u}\n",
+ type, sub->s.seq.lower, sub->s.seq.upper);
res = false;
}
spin_unlock_bh(&tn->nametbl_lock);
@@ -852,12 +856,13 @@ bool tipc_nametbl_subscribe(struct tipc_subscription *sub)
void tipc_nametbl_unsubscribe(struct tipc_subscription *sub)
{
struct tipc_net *tn = tipc_net(sub->net);
- struct tipc_subscr *s = &sub->evt.s;
- u32 type = tipc_sub_read(s, seq.type);
struct tipc_service *sc;
+ struct tipc_uaddr ua;
+ tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE,
+ sub->s.seq.type, sub->s.seq.lower, sub->s.seq.upper);
spin_lock_bh(&tn->nametbl_lock);
- sc = tipc_service_find(sub->net, type);
+ sc = tipc_service_find(sub->net, &ua);
if (!sc)
goto exit;
@@ -909,7 +914,7 @@ static void tipc_service_delete(struct net *net, struct tipc_service *sc)
spin_lock_bh(&sc->lock);
rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) {
list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) {
- tipc_service_remove_publ(sr, p->node, p->key);
+ tipc_service_remove_publ(sr, &p->sk, p->key);
kfree_rcu(p, rcu);
}
rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks);
@@ -993,9 +998,9 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg,
goto publ_msg_full;
if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope))
goto publ_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->node))
+ if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->sk.node))
goto publ_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->port))
+ if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->sk.ref))
goto publ_msg_full;
if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key))
goto publ_msg_full;
@@ -1046,6 +1051,7 @@ static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg,
struct tipc_net *tn = tipc_net(net);
struct tipc_service *service = NULL;
struct hlist_head *head;
+ struct tipc_uaddr ua;
int err;
int i;
@@ -1059,7 +1065,9 @@ static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg,
if (*last_type ||
(!i && *last_key && (*last_lower == *last_key))) {
- service = tipc_service_find(net, *last_type);
+ tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE,
+ *last_type, *last_lower, *last_lower);
+ service = tipc_service_find(net, &ua);
if (!service)
return -EPIPE;
} else {
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 5a82a01369d6..c7c9a3ddd420 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -3,6 +3,7 @@
*
* Copyright (c) 2000-2006, 2014-2018, Ericsson AB
* Copyright (c) 2004-2005, 2010-2011, Wind River Systems
+ * Copyright (c) 2020-2021, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -41,6 +42,7 @@ struct tipc_subscription;
struct tipc_plist;
struct tipc_nlist;
struct tipc_group;
+struct tipc_uaddr;
/*
* TIPC name types reserved for internal TIPC use (both current and planned)
@@ -50,13 +52,10 @@ struct tipc_group;
#define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */
/**
- * struct publication - info about a published (name or) name sequence
- * @type: name sequence type
- * @lower: name sequence lower bound
- * @upper: name sequence upper bound
+ * struct publication - info about a published service address or range
+ * @sr: service range represented by this publication
+ * @sk: address of socket bound to this publication
* @scope: scope of publication, TIPC_NODE_SCOPE or TIPC_CLUSTER_SCOPE
- * @node: network address of publishing socket's node
- * @port: publishing port
* @key: publication key, unique across the cluster
* @id: publication id
* @binding_node: all publications from the same node which bound this one
@@ -74,12 +73,9 @@ struct tipc_group;
* @rcu: RCU callback head used for deferred freeing
*/
struct publication {
- u32 type;
- u32 lower;
- u32 upper;
- u32 scope;
- u32 node;
- u32 port;
+ struct tipc_service_range sr;
+ struct tipc_socket_addr sk;
+ u16 scope;
u32 key;
u32 id;
struct list_head binding_node;
@@ -114,28 +110,29 @@ struct name_table {
};
int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
-
-u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
-void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
- u32 scope, bool exact, struct list_head *dports);
+bool tipc_nametbl_lookup_anycast(struct net *net, struct tipc_uaddr *ua,
+ struct tipc_socket_addr *sk);
+void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua,
+ bool exact, struct list_head *dports);
+void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua,
+ struct tipc_nlist *nodes);
+bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua,
+ struct list_head *dsts, int *dstcnt,
+ u32 exclude, bool mcast);
void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
- u32 type, u32 domain);
-void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
- u32 upper, struct tipc_nlist *nodes);
-bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain,
- struct list_head *dsts, int *dstcnt, u32 exclude,
- bool all);
-struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
- u32 upper, u32 scope, u32 port,
- u32 key);
-int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 upper,
- u32 key);
-struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
- u32 lower, u32 upper, u32 scope,
- u32 node, u32 ref, u32 key);
-struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
- u32 lower, u32 upper,
- u32 node, u32 key);
+ struct tipc_uaddr *ua);
+struct publication *tipc_nametbl_publish(struct net *net, struct tipc_uaddr *ua,
+ struct tipc_socket_addr *sk, u32 key);
+void tipc_nametbl_withdraw(struct net *net, struct tipc_uaddr *ua,
+ struct tipc_socket_addr *sk, u32 key);
+struct publication *tipc_nametbl_insert_publ(struct net *net,
+ struct tipc_uaddr *ua,
+ struct tipc_socket_addr *sk,
+ u32 key);
+struct publication *tipc_nametbl_remove_publ(struct net *net,
+ struct tipc_uaddr *ua,
+ struct tipc_socket_addr *sk,
+ u32 key);
bool tipc_nametbl_subscribe(struct tipc_subscription *s);
void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
int tipc_nametbl_init(struct net *net);
diff --git a/net/tipc/net.c b/net/tipc/net.c
index a129f661bee3..3f927949bb23 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -125,6 +125,11 @@ int tipc_net_init(struct net *net, u8 *node_id, u32 addr)
static void tipc_net_finalize(struct net *net, u32 addr)
{
struct tipc_net *tn = tipc_net(net);
+ struct tipc_socket_addr sk = {0, addr};
+ struct tipc_uaddr ua;
+
+ tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_CLUSTER_SCOPE,
+ TIPC_NODE_STATE, addr, addr);
if (cmpxchg(&tn->node_addr, 0, addr))
return;
@@ -132,8 +137,7 @@ static void tipc_net_finalize(struct net *net, u32 addr)
tipc_named_reinit(net);
tipc_sk_reinit(net);
tipc_mon_reinit_self(net);
- tipc_nametbl_publish(net, TIPC_NODE_STATE, addr, addr,
- TIPC_CLUSTER_SCOPE, 0, addr);
+ tipc_nametbl_publish(net, &ua, &sk, addr);
}
void tipc_net_finalize_work(struct work_struct *work)
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 5a1ce64039f7..0749df80454d 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -696,7 +696,7 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg,
if (err)
return err;
- link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]);
+ link_info.dest = htonl(nla_get_flag(link[TIPC_NLA_LINK_DEST]));
link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP]));
nla_strscpy(link_info.str, link[TIPC_NLA_LINK_NAME],
TIPC_MAX_LINK_NAME);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 136338b85504..61c38eaaa298 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -372,42 +372,49 @@ static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id)
}
static void tipc_node_read_lock(struct tipc_node *n)
+ __acquires(n->lock)
{
read_lock_bh(&n->lock);
}
static void tipc_node_read_unlock(struct tipc_node *n)
+ __releases(n->lock)
{
read_unlock_bh(&n->lock);
}
static void tipc_node_write_lock(struct tipc_node *n)
+ __acquires(n->lock)
{
write_lock_bh(&n->lock);
}
static void tipc_node_write_unlock_fast(struct tipc_node *n)
+ __releases(n->lock)
{
write_unlock_bh(&n->lock);
}
static void tipc_node_write_unlock(struct tipc_node *n)
+ __releases(n->lock)
{
+ struct tipc_socket_addr sk;
struct net *net = n->net;
- u32 addr = 0;
u32 flags = n->action_flags;
- u32 link_id = 0;
- u32 bearer_id;
struct list_head *publ_list;
+ struct tipc_uaddr ua;
+ u32 bearer_id;
if (likely(!flags)) {
write_unlock_bh(&n->lock);
return;
}
- addr = n->addr;
- link_id = n->link_id;
- bearer_id = link_id & 0xffff;
+ tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE,
+ TIPC_LINK_STATE, n->addr, n->addr);
+ sk.ref = n->link_id;
+ sk.node = n->addr;
+ bearer_id = n->link_id & 0xffff;
publ_list = &n->publ_list;
n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
@@ -416,20 +423,18 @@ static void tipc_node_write_unlock(struct tipc_node *n)
write_unlock_bh(&n->lock);
if (flags & TIPC_NOTIFY_NODE_DOWN)
- tipc_publ_notify(net, publ_list, addr, n->capabilities);
+ tipc_publ_notify(net, publ_list, n->addr, n->capabilities);
if (flags & TIPC_NOTIFY_NODE_UP)
- tipc_named_node_up(net, addr, n->capabilities);
+ tipc_named_node_up(net, n->addr, n->capabilities);
if (flags & TIPC_NOTIFY_LINK_UP) {
- tipc_mon_peer_up(net, addr, bearer_id);
- tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr,
- TIPC_NODE_SCOPE, link_id, link_id);
+ tipc_mon_peer_up(net, n->addr, bearer_id);
+ tipc_nametbl_publish(net, &ua, &sk, n->link_id);
}
if (flags & TIPC_NOTIFY_LINK_DOWN) {
- tipc_mon_peer_down(net, addr, bearer_id);
- tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
- addr, link_id);
+ tipc_mon_peer_down(net, n->addr, bearer_id);
+ tipc_nametbl_withdraw(net, &ua, &sk, n->link_id);
}
}
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index cebcc104dc70..117a472a8e61 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -3,7 +3,7 @@
*
* Copyright (c) 2001-2007, 2012-2019, Ericsson AB
* Copyright (c) 2004-2008, 2010-2013, Wind River Systems
- * Copyright (c) 2020, Red Hat Inc
+ * Copyright (c) 2020-2021, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -111,7 +111,6 @@ struct tipc_sock {
struct sock sk;
u32 conn_type;
u32 conn_instance;
- int published;
u32 max_pkt;
u32 maxnagle;
u32 portid;
@@ -141,6 +140,7 @@ struct tipc_sock {
bool expect_ack;
bool nodelay;
bool group_is_open;
+ bool published;
};
static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
@@ -151,10 +151,8 @@ static int tipc_release(struct socket *sock);
static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
bool kern);
static void tipc_sk_timeout(struct timer_list *t);
-static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
- struct tipc_service_range const *seq);
-static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
- struct tipc_service_range const *seq);
+static int tipc_sk_publish(struct tipc_sock *tsk, struct tipc_uaddr *ua);
+static int tipc_sk_withdraw(struct tipc_sock *tsk, struct tipc_uaddr *ua);
static int tipc_sk_leave(struct tipc_sock *tsk);
static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid);
static int tipc_sk_insert(struct tipc_sock *tsk);
@@ -644,7 +642,7 @@ static int tipc_release(struct socket *sock)
__tipc_shutdown(sock, TIPC_ERR_NO_PORT);
sk->sk_shutdown = SHUTDOWN_MASK;
tipc_sk_leave(tsk);
- tipc_sk_withdraw(tsk, 0, NULL);
+ tipc_sk_withdraw(tsk, NULL);
__skb_queue_purge(&tsk->mc_method.deferredq);
sk_stop_timer(sk, &sk->sk_timer);
tipc_sk_remove(tsk);
@@ -677,22 +675,31 @@ static int tipc_release(struct socket *sock)
*/
static int __tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen)
{
- struct sockaddr_tipc *addr = (struct sockaddr_tipc *)skaddr;
+ struct tipc_uaddr *ua = (struct tipc_uaddr *)skaddr;
struct tipc_sock *tsk = tipc_sk(sock->sk);
+ bool unbind = false;
if (unlikely(!alen))
- return tipc_sk_withdraw(tsk, 0, NULL);
+ return tipc_sk_withdraw(tsk, NULL);
- if (addr->addrtype == TIPC_SERVICE_ADDR)
- addr->addr.nameseq.upper = addr->addr.nameseq.lower;
+ if (ua->addrtype == TIPC_SERVICE_ADDR) {
+ ua->addrtype = TIPC_SERVICE_RANGE;
+ ua->sr.upper = ua->sr.lower;
+ }
+ if (ua->scope < 0) {
+ unbind = true;
+ ua->scope = -ua->scope;
+ }
+ /* Users may still use deprecated TIPC_ZONE_SCOPE */
+ if (ua->scope != TIPC_NODE_SCOPE)
+ ua->scope = TIPC_CLUSTER_SCOPE;
if (tsk->group)
return -EACCES;
- if (addr->scope >= 0)
- return tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq);
- else
- return tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq);
+ if (unbind)
+ return tipc_sk_withdraw(tsk, ua);
+ return tipc_sk_publish(tsk, ua);
}
int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen)
@@ -707,18 +714,17 @@ int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen)
static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen)
{
- struct sockaddr_tipc *addr = (struct sockaddr_tipc *)skaddr;
+ struct tipc_uaddr *ua = (struct tipc_uaddr *)skaddr;
+ u32 atype = ua->addrtype;
if (alen) {
- if (alen < sizeof(struct sockaddr_tipc))
+ if (!tipc_uaddr_valid(ua, alen))
return -EINVAL;
- if (addr->family != AF_TIPC)
+ if (atype == TIPC_SOCKET_ADDR)
return -EAFNOSUPPORT;
- if (addr->addrtype > TIPC_SERVICE_ADDR)
- return -EAFNOSUPPORT;
- if (addr->addr.nameseq.type < TIPC_RESERVED_TYPES) {
+ if (ua->sr.type < TIPC_RESERVED_TYPES) {
pr_warn_once("Can't bind to reserved service type %u\n",
- addr->addr.nameseq.type);
+ ua->sr.type);
return -EACCES;
}
}
@@ -826,7 +832,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
/**
* tipc_sendmcast - send multicast message
* @sock: socket structure
- * @seq: destination address
+ * @ua: destination address struct
* @msg: message to send
* @dlen: length of data to send
* @timeout: timeout to wait for wakeup
@@ -834,7 +840,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
* Called from function tipc_sendmsg(), which has done all sanity checks
* Return: the number of bytes sent on success, or errno
*/
-static int tipc_sendmcast(struct socket *sock, struct tipc_service_range *seq,
+static int tipc_sendmcast(struct socket *sock, struct tipc_uaddr *ua,
struct msghdr *msg, size_t dlen, long timeout)
{
struct sock *sk = sock->sk;
@@ -842,7 +848,6 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_service_range *seq,
struct tipc_msg *hdr = &tsk->phdr;
struct net *net = sock_net(sk);
int mtu = tipc_bcast_get_mtu(net);
- struct tipc_mc_method *method = &tsk->mc_method;
struct sk_buff_head pkts;
struct tipc_nlist dsts;
int rc;
@@ -857,8 +862,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_service_range *seq,
/* Lookup destination nodes */
tipc_nlist_init(&dsts, tipc_own_addr(net));
- tipc_nametbl_lookup_dst_nodes(net, seq->type, seq->lower,
- seq->upper, &dsts);
+ tipc_nametbl_lookup_mcast_nodes(net, ua, &dsts);
if (!dsts.local && !dsts.remote)
return -EHOSTUNREACH;
@@ -868,9 +872,9 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_service_range *seq,
msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE);
msg_set_destport(hdr, 0);
msg_set_destnode(hdr, 0);
- msg_set_nametype(hdr, seq->type);
- msg_set_namelower(hdr, seq->lower);
- msg_set_nameupper(hdr, seq->upper);
+ msg_set_nametype(hdr, ua->sr.type);
+ msg_set_namelower(hdr, ua->sr.lower);
+ msg_set_nameupper(hdr, ua->sr.upper);
/* Build message as chain of buffers */
__skb_queue_head_init(&pkts);
@@ -880,7 +884,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_service_range *seq,
if (unlikely(rc == dlen)) {
trace_tipc_sk_sendmcast(sk, skb_peek(&pkts),
TIPC_DUMP_SK_SNDQ, " ");
- rc = tipc_mcast_xmit(net, &pkts, method, &dsts,
+ rc = tipc_mcast_xmit(net, &pkts, &tsk->mc_method, &dsts,
&tsk->cong_link_cnt);
}
@@ -954,7 +958,7 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m,
int dlen, long timeout)
{
struct sock *sk = sock->sk;
- DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
+ struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name;
int blks = tsk_blocks(GROUP_H_SIZE + dlen);
struct tipc_sock *tsk = tipc_sk(sk);
struct net *net = sock_net(sk);
@@ -962,8 +966,8 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m,
u32 node, port;
int rc;
- node = dest->addr.id.node;
- port = dest->addr.id.ref;
+ node = ua->sk.node;
+ port = ua->sk.ref;
if (!port && !node)
return -EHOSTUNREACH;
@@ -997,7 +1001,7 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m,
static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
int dlen, long timeout)
{
- DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
+ struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name;
struct sock *sk = sock->sk;
struct tipc_sock *tsk = tipc_sk(sk);
struct list_head *cong_links = &tsk->cong_links;
@@ -1008,16 +1012,13 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
struct net *net = sock_net(sk);
u32 node, port, exclude;
struct list_head dsts;
- u32 type, inst, scope;
int lookups = 0;
int dstcnt, rc;
bool cong;
INIT_LIST_HEAD(&dsts);
-
- type = msg_nametype(hdr);
- inst = dest->addr.name.name.instance;
- scope = msg_lookup_scope(hdr);
+ ua->sa.type = msg_nametype(hdr);
+ ua->scope = msg_lookup_scope(hdr);
while (++lookups < 4) {
exclude = tipc_group_exclude(tsk->group);
@@ -1026,8 +1027,8 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
/* Look for a non-congested destination member, if any */
while (1) {
- if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts,
- &dstcnt, exclude, false))
+ if (!tipc_nametbl_lookup_group(net, ua, &dsts, &dstcnt,
+ exclude, false))
return -EHOSTUNREACH;
tipc_dest_pop(&dsts, &node, &port);
cong = tipc_group_cong(tsk->group, node, port, blks,
@@ -1082,7 +1083,7 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m,
int dlen, long timeout)
{
- DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
+ struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name;
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
struct tipc_sock *tsk = tipc_sk(sk);
@@ -1107,9 +1108,9 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m,
return -EHOSTUNREACH;
/* Complete message header */
- if (dest) {
+ if (ua) {
msg_set_type(hdr, TIPC_GRP_MCAST_MSG);
- msg_set_nameinst(hdr, dest->addr.name.name.instance);
+ msg_set_nameinst(hdr, ua->sa.instance);
} else {
msg_set_type(hdr, TIPC_GRP_BCAST_MSG);
msg_set_nameinst(hdr, 0);
@@ -1156,29 +1157,25 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m,
static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m,
int dlen, long timeout)
{
+ struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name;
struct sock *sk = sock->sk;
- DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
struct tipc_sock *tsk = tipc_sk(sk);
struct tipc_group *grp = tsk->group;
struct tipc_msg *hdr = &tsk->phdr;
struct net *net = sock_net(sk);
- u32 type, inst, scope, exclude;
struct list_head dsts;
- u32 dstcnt;
+ u32 dstcnt, exclude;
INIT_LIST_HEAD(&dsts);
-
- type = msg_nametype(hdr);
- inst = dest->addr.name.name.instance;
- scope = msg_lookup_scope(hdr);
+ ua->sa.type = msg_nametype(hdr);
+ ua->scope = msg_lookup_scope(hdr);
exclude = tipc_group_exclude(grp);
- if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts,
- &dstcnt, exclude, true))
+ if (!tipc_nametbl_lookup_group(net, ua, &dsts, &dstcnt, exclude, true))
return -EHOSTUNREACH;
if (dstcnt == 1) {
- tipc_dest_pop(&dsts, &dest->addr.id.node, &dest->addr.id.ref);
+ tipc_dest_pop(&dsts, &ua->sk.node, &ua->sk.ref);
return tipc_send_group_unicast(sock, m, dlen, timeout);
}
@@ -1198,17 +1195,18 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
struct sk_buff_head *inputq)
{
u32 self = tipc_own_addr(net);
- u32 type, lower, upper, scope;
struct sk_buff *skb, *_skb;
u32 portid, onode;
struct sk_buff_head tmpq;
struct list_head dports;
struct tipc_msg *hdr;
+ struct tipc_uaddr ua;
int user, mtyp, hlen;
bool exact;
__skb_queue_head_init(&tmpq);
INIT_LIST_HEAD(&dports);
+ ua.addrtype = TIPC_SERVICE_RANGE;
skb = tipc_skb_peek(arrvq, &inputq->lock);
for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) {
@@ -1217,7 +1215,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
mtyp = msg_type(hdr);
hlen = skb_headroom(skb) + msg_hdr_sz(hdr);
onode = msg_orignode(hdr);
- type = msg_nametype(hdr);
+ ua.sr.type = msg_nametype(hdr);
if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) {
spin_lock_bh(&inputq->lock);
@@ -1232,24 +1230,23 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
/* Group messages require exact scope match */
if (msg_in_group(hdr)) {
- lower = 0;
- upper = ~0;
- scope = msg_lookup_scope(hdr);
+ ua.sr.lower = 0;
+ ua.sr.upper = ~0;
+ ua.scope = msg_lookup_scope(hdr);
exact = true;
} else {
/* TIPC_NODE_SCOPE means "any scope" in this context */
if (onode == self)
- scope = TIPC_NODE_SCOPE;
+ ua.scope = TIPC_NODE_SCOPE;
else
- scope = TIPC_CLUSTER_SCOPE;
+ ua.scope = TIPC_CLUSTER_SCOPE;
exact = false;
- lower = msg_namelower(hdr);
- upper = msg_nameupper(hdr);
+ ua.sr.lower = msg_namelower(hdr);
+ ua.sr.upper = msg_nameupper(hdr);
}
/* Create destination port list: */
- tipc_nametbl_mc_lookup(net, type, lower, upper,
- scope, exact, &dports);
+ tipc_nametbl_lookup_mcast_sockets(net, &ua, exact, &dports);
/* Clone message per destination */
while (tipc_dest_pop(&dports, NULL, &portid)) {
@@ -1417,44 +1414,43 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
struct tipc_sock *tsk = tipc_sk(sk);
- DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
+ struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name;
long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
struct list_head *clinks = &tsk->cong_links;
bool syn = !tipc_sk_type_connectionless(sk);
struct tipc_group *grp = tsk->group;
struct tipc_msg *hdr = &tsk->phdr;
- struct tipc_service_range *seq;
+ struct tipc_socket_addr skaddr;
struct sk_buff_head pkts;
- u32 dport = 0, dnode = 0;
- u32 type = 0, inst = 0;
- int mtu, rc;
+ int atype, mtu, rc;
if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE))
return -EMSGSIZE;
- if (likely(dest)) {
- if (unlikely(m->msg_namelen < sizeof(*dest)))
- return -EINVAL;
- if (unlikely(dest->family != AF_TIPC))
+ if (ua) {
+ if (!tipc_uaddr_valid(ua, m->msg_namelen))
return -EINVAL;
+ atype = ua->addrtype;
}
+ /* If socket belongs to a communication group follow other paths */
if (grp) {
- if (!dest)
+ if (!ua)
return tipc_send_group_bcast(sock, m, dlen, timeout);
- if (dest->addrtype == TIPC_SERVICE_ADDR)
+ if (atype == TIPC_SERVICE_ADDR)
return tipc_send_group_anycast(sock, m, dlen, timeout);
- if (dest->addrtype == TIPC_SOCKET_ADDR)
+ if (atype == TIPC_SOCKET_ADDR)
return tipc_send_group_unicast(sock, m, dlen, timeout);
- if (dest->addrtype == TIPC_ADDR_MCAST)
+ if (atype == TIPC_SERVICE_RANGE)
return tipc_send_group_mcast(sock, m, dlen, timeout);
return -EINVAL;
}
- if (unlikely(!dest)) {
- dest = &tsk->peer;
- if (!syn && dest->family != AF_TIPC)
+ if (!ua) {
+ ua = (struct tipc_uaddr *)&tsk->peer;
+ if (!syn && ua->family != AF_TIPC)
return -EDESTADDRREQ;
+ atype = ua->addrtype;
}
if (unlikely(syn)) {
@@ -1464,54 +1460,51 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
return -EISCONN;
if (tsk->published)
return -EOPNOTSUPP;
- if (dest->addrtype == TIPC_SERVICE_ADDR) {
- tsk->conn_type = dest->addr.name.name.type;
- tsk->conn_instance = dest->addr.name.name.instance;
+ if (atype == TIPC_SERVICE_ADDR) {
+ tsk->conn_type = ua->sa.type;
+ tsk->conn_instance = ua->sa.instance;
}
msg_set_syn(hdr, 1);
}
- seq = &dest->addr.nameseq;
- if (dest->addrtype == TIPC_ADDR_MCAST)
- return tipc_sendmcast(sock, seq, m, dlen, timeout);
-
- if (dest->addrtype == TIPC_SERVICE_ADDR) {
- type = dest->addr.name.name.type;
- inst = dest->addr.name.name.instance;
- dnode = dest->addr.name.domain;
- dport = tipc_nametbl_translate(net, type, inst, &dnode);
- if (unlikely(!dport && !dnode))
+ /* Determine destination */
+ if (atype == TIPC_SERVICE_RANGE) {
+ return tipc_sendmcast(sock, ua, m, dlen, timeout);
+ } else if (atype == TIPC_SERVICE_ADDR) {
+ skaddr.node = ua->lookup_node;
+ ua->scope = tipc_node2scope(skaddr.node);
+ if (!tipc_nametbl_lookup_anycast(net, ua, &skaddr))
return -EHOSTUNREACH;
- } else if (dest->addrtype == TIPC_SOCKET_ADDR) {
- dnode = dest->addr.id.node;
+ } else if (atype == TIPC_SOCKET_ADDR) {
+ skaddr = ua->sk;
} else {
return -EINVAL;
}
/* Block or return if destination link is congested */
rc = tipc_wait_for_cond(sock, &timeout,
- !tipc_dest_find(clinks, dnode, 0));
+ !tipc_dest_find(clinks, skaddr.node, 0));
if (unlikely(rc))
return rc;
- if (dest->addrtype == TIPC_SERVICE_ADDR) {
+ /* Finally build message header */
+ msg_set_destnode(hdr, skaddr.node);
+ msg_set_destport(hdr, skaddr.ref);
+ if (atype == TIPC_SERVICE_ADDR) {
msg_set_type(hdr, TIPC_NAMED_MSG);
msg_set_hdr_sz(hdr, NAMED_H_SIZE);
- msg_set_nametype(hdr, type);
- msg_set_nameinst(hdr, inst);
- msg_set_lookup_scope(hdr, tipc_node2scope(dnode));
- msg_set_destnode(hdr, dnode);
- msg_set_destport(hdr, dport);
+ msg_set_nametype(hdr, ua->sa.type);
+ msg_set_nameinst(hdr, ua->sa.instance);
+ msg_set_lookup_scope(hdr, ua->scope);
} else { /* TIPC_SOCKET_ADDR */
msg_set_type(hdr, TIPC_DIRECT_MSG);
msg_set_lookup_scope(hdr, 0);
- msg_set_destnode(hdr, dnode);
- msg_set_destport(hdr, dest->addr.id.ref);
msg_set_hdr_sz(hdr, BASIC_H_SIZE);
}
+ /* Add message body */
__skb_queue_head_init(&pkts);
- mtu = tipc_node_get_mtu(net, dnode, tsk->portid, true);
+ mtu = tipc_node_get_mtu(net, skaddr.node, tsk->portid, true);
rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
if (unlikely(rc != dlen))
return rc;
@@ -1520,10 +1513,11 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
return -ENOMEM;
}
+ /* Send message */
trace_tipc_sk_sendmsg(sk, skb_peek(&pkts), TIPC_DUMP_SK_SNDQ, " ");
- rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
+ rc = tipc_node_xmit(net, &pkts, skaddr.node, tsk->portid);
if (unlikely(rc == -ELINKCONG)) {
- tipc_dest_push(clinks, dnode, 0);
+ tipc_dest_push(clinks, skaddr.node, 0);
tsk->cong_link_cnt++;
rc = 0;
}
@@ -2891,66 +2885,62 @@ static void tipc_sk_timeout(struct timer_list *t)
sock_put(sk);
}
-static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
- struct tipc_service_range const *seq)
+static int tipc_sk_publish(struct tipc_sock *tsk, struct tipc_uaddr *ua)
{
struct sock *sk = &tsk->sk;
struct net *net = sock_net(sk);
- struct publication *publ;
+ struct tipc_socket_addr skaddr;
+ struct publication *p;
u32 key;
- if (scope != TIPC_NODE_SCOPE)
- scope = TIPC_CLUSTER_SCOPE;
-
if (tipc_sk_connected(sk))
return -EINVAL;
key = tsk->portid + tsk->pub_count + 1;
if (key == tsk->portid)
return -EADDRINUSE;
-
- publ = tipc_nametbl_publish(net, seq->type, seq->lower, seq->upper,
- scope, tsk->portid, key);
- if (unlikely(!publ))
+ skaddr.ref = tsk->portid;
+ skaddr.node = tipc_own_addr(net);
+ p = tipc_nametbl_publish(net, ua, &skaddr, key);
+ if (unlikely(!p))
return -EINVAL;
- list_add(&publ->binding_sock, &tsk->publications);
+ list_add(&p->binding_sock, &tsk->publications);
tsk->pub_count++;
- tsk->published = 1;
+ tsk->published = true;
return 0;
}
-static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
- struct tipc_service_range const *seq)
+static int tipc_sk_withdraw(struct tipc_sock *tsk, struct tipc_uaddr *ua)
{
struct net *net = sock_net(&tsk->sk);
- struct publication *publ;
- struct publication *safe;
+ struct publication *safe, *p;
+ struct tipc_uaddr _ua;
int rc = -EINVAL;
- if (scope != TIPC_NODE_SCOPE)
- scope = TIPC_CLUSTER_SCOPE;
-
- list_for_each_entry_safe(publ, safe, &tsk->publications, binding_sock) {
- if (seq) {
- if (publ->scope != scope)
- continue;
- if (publ->type != seq->type)
- continue;
- if (publ->lower != seq->lower)
- continue;
- if (publ->upper != seq->upper)
- break;
- tipc_nametbl_withdraw(net, publ->type, publ->lower,
- publ->upper, publ->key);
- rc = 0;
- break;
+ list_for_each_entry_safe(p, safe, &tsk->publications, binding_sock) {
+ if (!ua) {
+ tipc_uaddr(&_ua, TIPC_SERVICE_RANGE, p->scope,
+ p->sr.type, p->sr.lower, p->sr.upper);
+ tipc_nametbl_withdraw(net, &_ua, &p->sk, p->key);
+ continue;
}
- tipc_nametbl_withdraw(net, publ->type, publ->lower,
- publ->upper, publ->key);
+ /* Unbind specific publication */
+ if (p->scope != ua->scope)
+ continue;
+ if (p->sr.type != ua->sr.type)
+ continue;
+ if (p->sr.lower != ua->sr.lower)
+ continue;
+ if (p->sr.upper != ua->sr.upper)
+ break;
+ tipc_nametbl_withdraw(net, ua, &p->sk, p->key);
rc = 0;
+ break;
}
- if (list_empty(&tsk->publications))
+ if (list_empty(&tsk->publications)) {
tsk->published = 0;
+ rc = 0;
+ }
return rc;
}
@@ -3067,13 +3057,15 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
struct net *net = sock_net(&tsk->sk);
struct tipc_group *grp = tsk->group;
struct tipc_msg *hdr = &tsk->phdr;
- struct tipc_service_range seq;
+ struct tipc_uaddr ua;
int rc;
if (mreq->type < TIPC_RESERVED_TYPES)
return -EACCES;
if (mreq->scope > TIPC_NODE_SCOPE)
return -EINVAL;
+ if (mreq->scope != TIPC_NODE_SCOPE)
+ mreq->scope = TIPC_CLUSTER_SCOPE;
if (grp)
return -EACCES;
grp = tipc_group_create(net, tsk->portid, mreq, &tsk->group_is_open);
@@ -3083,11 +3075,10 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
msg_set_lookup_scope(hdr, mreq->scope);
msg_set_nametype(hdr, mreq->type);
msg_set_dest_droppable(hdr, true);
- seq.type = mreq->type;
- seq.lower = mreq->instance;
- seq.upper = seq.lower;
- tipc_nametbl_build_group(net, grp, mreq->type, mreq->scope);
- rc = tipc_sk_publish(tsk, mreq->scope, &seq);
+ tipc_uaddr(&ua, TIPC_SERVICE_RANGE, mreq->scope,
+ mreq->type, mreq->instance, mreq->instance);
+ tipc_nametbl_build_group(net, grp, &ua);
+ rc = tipc_sk_publish(tsk, &ua);
if (rc) {
tipc_group_delete(net, grp);
tsk->group = NULL;
@@ -3104,15 +3095,17 @@ static int tipc_sk_leave(struct tipc_sock *tsk)
{
struct net *net = sock_net(&tsk->sk);
struct tipc_group *grp = tsk->group;
- struct tipc_service_range seq;
+ struct tipc_uaddr ua;
int scope;
if (!grp)
return -EINVAL;
- tipc_group_self(grp, &seq, &scope);
+ ua.addrtype = TIPC_SERVICE_RANGE;
+ tipc_group_self(grp, &ua.sr, &scope);
+ ua.scope = scope;
tipc_group_delete(net, grp);
tsk->group = NULL;
- tipc_sk_withdraw(tsk, scope, &seq);
+ tipc_sk_withdraw(tsk, &ua);
return 0;
}
@@ -3711,11 +3704,11 @@ static int __tipc_nl_add_sk_publ(struct sk_buff *skb,
if (nla_put_u32(skb, TIPC_NLA_PUBL_KEY, publ->key))
goto attr_msg_cancel;
- if (nla_put_u32(skb, TIPC_NLA_PUBL_TYPE, publ->type))
+ if (nla_put_u32(skb, TIPC_NLA_PUBL_TYPE, publ->sr.type))
goto attr_msg_cancel;
- if (nla_put_u32(skb, TIPC_NLA_PUBL_LOWER, publ->lower))
+ if (nla_put_u32(skb, TIPC_NLA_PUBL_LOWER, publ->sr.lower))
goto attr_msg_cancel;
- if (nla_put_u32(skb, TIPC_NLA_PUBL_UPPER, publ->upper))
+ if (nla_put_u32(skb, TIPC_NLA_PUBL_UPPER, publ->sr.upper))
goto attr_msg_cancel;
nla_nest_end(skb, attrs);
@@ -3863,9 +3856,9 @@ bool tipc_sk_filtering(struct sock *sk)
p = list_first_entry_or_null(&tsk->publications,
struct publication, binding_sock);
if (p) {
- type = p->type;
- lower = p->lower;
- upper = p->upper;
+ type = p->sr.type;
+ lower = p->sr.lower;
+ upper = p->sr.upper;
}
}
@@ -3964,9 +3957,9 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf)
if (tsk->published) {
p = list_first_entry_or_null(&tsk->publications,
struct publication, binding_sock);
- i += scnprintf(buf + i, sz - i, " %u", (p) ? p->type : 0);
- i += scnprintf(buf + i, sz - i, " %u", (p) ? p->lower : 0);
- i += scnprintf(buf + i, sz - i, " %u", (p) ? p->upper : 0);
+ i += scnprintf(buf + i, sz - i, " %u", (p) ? p->sr.type : 0);
+ i += scnprintf(buf + i, sz - i, " %u", (p) ? p->sr.lower : 0);
+ i += scnprintf(buf + i, sz - i, " %u", (p) ? p->sr.upper : 0);
}
i += scnprintf(buf + i, sz - i, " | %u", tsk->snd_win);
i += scnprintf(buf + i, sz - i, " %u", tsk->rcv_win);
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index f6ad0005218c..8e00d739f03a 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -3,7 +3,7 @@
*
* Copyright (c) 2000-2017, Ericsson AB
* Copyright (c) 2005-2007, 2010-2013, Wind River Systems
- * Copyright (c) 2020, Red Hat Inc
+ * Copyright (c) 2020-2021, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -40,77 +40,75 @@
#include "subscr.h"
static void tipc_sub_send_event(struct tipc_subscription *sub,
- u32 found_lower, u32 found_upper,
- u32 event, u32 port, u32 node)
+ struct publication *p,
+ u32 event)
{
+ struct tipc_subscr *s = &sub->evt.s;
struct tipc_event *evt = &sub->evt;
if (sub->inactive)
return;
tipc_evt_write(evt, event, event);
- tipc_evt_write(evt, found_lower, found_lower);
- tipc_evt_write(evt, found_upper, found_upper);
- tipc_evt_write(evt, port.ref, port);
- tipc_evt_write(evt, port.node, node);
+ if (p) {
+ tipc_evt_write(evt, found_lower, p->sr.lower);
+ tipc_evt_write(evt, found_upper, p->sr.upper);
+ tipc_evt_write(evt, port.ref, p->sk.ref);
+ tipc_evt_write(evt, port.node, p->sk.node);
+ } else {
+ tipc_evt_write(evt, found_lower, s->seq.lower);
+ tipc_evt_write(evt, found_upper, s->seq.upper);
+ tipc_evt_write(evt, port.ref, 0);
+ tipc_evt_write(evt, port.node, 0);
+ }
tipc_topsrv_queue_evt(sub->net, sub->conid, event, evt);
}
/**
* tipc_sub_check_overlap - test for subscription overlap with the given values
- * @seq: tipc_name_seq to check
- * @found_lower: lower value to test
- * @found_upper: upper value to test
+ * @subscribed: the service range subscribed for
+ * @found: the service range we are checking for match
*
- * Return: 1 if there is overlap, otherwise 0.
+ * Return: true if there is overlap, otherwise false.
*/
-int tipc_sub_check_overlap(struct tipc_service_range *seq, u32 found_lower,
- u32 found_upper)
+static bool tipc_sub_check_overlap(struct tipc_service_range *subscribed,
+ struct tipc_service_range *found)
{
- if (found_lower < seq->lower)
- found_lower = seq->lower;
- if (found_upper > seq->upper)
- found_upper = seq->upper;
- if (found_lower > found_upper)
- return 0;
- return 1;
+ u32 found_lower = found->lower;
+ u32 found_upper = found->upper;
+
+ if (found_lower < subscribed->lower)
+ found_lower = subscribed->lower;
+ if (found_upper > subscribed->upper)
+ found_upper = subscribed->upper;
+ return found_lower <= found_upper;
}
void tipc_sub_report_overlap(struct tipc_subscription *sub,
- u32 found_lower, u32 found_upper,
- u32 event, u32 port, u32 node,
- u32 scope, int must)
+ struct publication *p,
+ u32 event, bool must)
{
- struct tipc_subscr *s = &sub->evt.s;
- u32 filter = tipc_sub_read(s, filter);
- struct tipc_service_range seq;
+ struct tipc_service_range *sr = &sub->s.seq;
+ u32 filter = sub->s.filter;
- seq.type = tipc_sub_read(s, seq.type);
- seq.lower = tipc_sub_read(s, seq.lower);
- seq.upper = tipc_sub_read(s, seq.upper);
-
- if (!tipc_sub_check_overlap(&seq, found_lower, found_upper))
+ if (!tipc_sub_check_overlap(sr, &p->sr))
return;
-
if (!must && !(filter & TIPC_SUB_PORTS))
return;
- if (filter & TIPC_SUB_CLUSTER_SCOPE && scope == TIPC_NODE_SCOPE)
+ if (filter & TIPC_SUB_CLUSTER_SCOPE && p->scope == TIPC_NODE_SCOPE)
return;
- if (filter & TIPC_SUB_NODE_SCOPE && scope != TIPC_NODE_SCOPE)
+ if (filter & TIPC_SUB_NODE_SCOPE && p->scope != TIPC_NODE_SCOPE)
return;
spin_lock(&sub->lock);
- tipc_sub_send_event(sub, found_lower, found_upper,
- event, port, node);
+ tipc_sub_send_event(sub, p, event);
spin_unlock(&sub->lock);
}
static void tipc_sub_timeout(struct timer_list *t)
{
struct tipc_subscription *sub = from_timer(sub, t, timer);
- struct tipc_subscr *s = &sub->evt.s;
spin_lock(&sub->lock);
- tipc_sub_send_event(sub, s->seq.lower, s->seq.upper,
- TIPC_SUBSCR_TIMEOUT, 0, 0);
+ tipc_sub_send_event(sub, NULL, TIPC_SUBSCR_TIMEOUT);
sub->inactive = true;
spin_unlock(&sub->lock);
}
@@ -134,12 +132,14 @@ struct tipc_subscription *tipc_sub_subscribe(struct net *net,
struct tipc_subscr *s,
int conid)
{
+ u32 lower = tipc_sub_read(s, seq.lower);
+ u32 upper = tipc_sub_read(s, seq.upper);
u32 filter = tipc_sub_read(s, filter);
struct tipc_subscription *sub;
u32 timeout;
if ((filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE) ||
- (tipc_sub_read(s, seq.lower) > tipc_sub_read(s, seq.upper))) {
+ lower > upper) {
pr_warn("Subscription rejected, illegal request\n");
return NULL;
}
@@ -154,6 +154,12 @@ struct tipc_subscription *tipc_sub_subscribe(struct net *net,
sub->conid = conid;
sub->inactive = false;
memcpy(&sub->evt.s, s, sizeof(*s));
+ sub->s.seq.type = tipc_sub_read(s, seq.type);
+ sub->s.seq.lower = lower;
+ sub->s.seq.upper = upper;
+ sub->s.filter = filter;
+ sub->s.timeout = tipc_sub_read(s, timeout);
+ memcpy(sub->s.usr_handle, s->usr_handle, 8);
spin_lock_init(&sub->lock);
kref_init(&sub->kref);
if (!tipc_nametbl_subscribe(sub)) {
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index 3ded27391d54..ddea6554ec46 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -3,7 +3,7 @@
*
* Copyright (c) 2003-2017, Ericsson AB
* Copyright (c) 2005-2007, 2012-2013, Wind River Systems
- * Copyright (c) 2020, Red Hat Inc
+ * Copyright (c) 2020-2021, Red Hat Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -43,6 +43,7 @@
#define TIPC_MAX_SUBSCR 65535
#define TIPC_MAX_PUBL 65535
+struct publication;
struct tipc_subscription;
struct tipc_conn;
@@ -59,12 +60,13 @@ struct tipc_conn;
* @lock: serialize up/down and timer events
*/
struct tipc_subscription {
+ struct tipc_subscr s;
+ struct tipc_event evt;
struct kref kref;
struct net *net;
struct timer_list timer;
struct list_head service_list;
struct list_head sub_list;
- struct tipc_event evt;
int conid;
bool inactive;
spinlock_t lock;
@@ -74,13 +76,9 @@ struct tipc_subscription *tipc_sub_subscribe(struct net *net,
struct tipc_subscr *s,
int conid);
void tipc_sub_unsubscribe(struct tipc_subscription *sub);
-
-int tipc_sub_check_overlap(struct tipc_service_range *seq,
- u32 found_lower, u32 found_upper);
void tipc_sub_report_overlap(struct tipc_subscription *sub,
- u32 found_lower, u32 found_upper,
- u32 event, u32 port, u32 node,
- u32 scope, int must);
+ struct publication *p,
+ u32 event, bool must);
int __net_init tipc_topsrv_init_net(struct net *net);
void __net_exit tipc_topsrv_exit_net(struct net *net);
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index d9cd229aa111..790c6b7ecb26 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -601,7 +601,7 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
if (!info ||
before(seq, info->end_seq - info->len)) {
/* if retransmit_hint is irrelevant start
- * from the beggining of the list
+ * from the beginning of the list
*/
info = list_first_entry_or_null(&context->records_list,
struct tls_record_info, list);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 4faabd1ecfd1..a71ed664da0a 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -445,6 +445,97 @@ static void xsk_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
}
+static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
+ struct xdp_desc *desc)
+{
+ struct xsk_buff_pool *pool = xs->pool;
+ u32 hr, len, ts, offset, copy, copied;
+ struct sk_buff *skb;
+ struct page *page;
+ void *buffer;
+ int err, i;
+ u64 addr;
+
+ hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
+
+ skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
+ if (unlikely(!skb))
+ return ERR_PTR(err);
+
+ skb_reserve(skb, hr);
+
+ addr = desc->addr;
+ len = desc->len;
+ ts = pool->unaligned ? len : pool->chunk_size;
+
+ buffer = xsk_buff_raw_get_data(pool, addr);
+ offset = offset_in_page(buffer);
+ addr = buffer - pool->addrs;
+
+ for (copied = 0, i = 0; copied < len; i++) {
+ page = pool->umem->pgs[addr >> PAGE_SHIFT];
+ get_page(page);
+
+ copy = min_t(u32, PAGE_SIZE - offset, len - copied);
+ skb_fill_page_desc(skb, i, page, offset, copy);
+
+ copied += copy;
+ addr += copy;
+ offset = 0;
+ }
+
+ skb->len += len;
+ skb->data_len += len;
+ skb->truesize += ts;
+
+ refcount_add(ts, &xs->sk.sk_wmem_alloc);
+
+ return skb;
+}
+
+static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
+ struct xdp_desc *desc)
+{
+ struct net_device *dev = xs->dev;
+ struct sk_buff *skb;
+
+ if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
+ skb = xsk_build_skb_zerocopy(xs, desc);
+ if (IS_ERR(skb))
+ return skb;
+ } else {
+ u32 hr, tr, len;
+ void *buffer;
+ int err;
+
+ hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
+ tr = dev->needed_tailroom;
+ len = desc->len;
+
+ skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
+ if (unlikely(!skb))
+ return ERR_PTR(err);
+
+ skb_reserve(skb, hr);
+ skb_put(skb, len);
+
+ buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
+ err = skb_store_bits(skb, 0, buffer, len);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+ }
+
+ skb->dev = dev;
+ skb->priority = xs->sk.sk_priority;
+ skb->mark = xs->sk.sk_mark;
+ skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
+ skb->destructor = xsk_destruct_skb;
+
+ return skb;
+}
+
static int xsk_generic_xmit(struct sock *sk)
{
struct xdp_sock *xs = xdp_sk(sk);
@@ -461,43 +552,30 @@ static int xsk_generic_xmit(struct sock *sk)
goto out;
while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
- char *buffer;
- u64 addr;
- u32 len;
-
if (max_batch-- == 0) {
err = -EAGAIN;
goto out;
}
- len = desc.len;
- skb = sock_alloc_send_skb(sk, len, 1, &err);
- if (unlikely(!skb))
+ skb = xsk_build_skb(xs, &desc);
+ if (IS_ERR(skb)) {
+ err = PTR_ERR(skb);
goto out;
+ }
- skb_put(skb, len);
- addr = desc.addr;
- buffer = xsk_buff_raw_get_data(xs->pool, addr);
- err = skb_store_bits(skb, 0, buffer, len);
/* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
spin_lock_irqsave(&xs->pool->cq_lock, flags);
- if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
+ if (xskq_prod_reserve(xs->pool->cq)) {
spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
kfree_skb(skb);
goto out;
}
spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
- skb->dev = xs->dev;
- skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
- skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
- skb->destructor = xsk_destruct_skb;
-
err = __dev_direct_xmit(skb, xs->queue_id);
if (err == NETDEV_TX_BUSY) {
/* Tell user-space to retry the send */
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 2823b7c3302d..2ac3802c2cd7 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -47,19 +47,18 @@ struct xsk_queue {
u64 queue_empty_descs;
};
-/* The structure of the shared state of the rings are the same as the
- * ring buffer in kernel/events/ring_buffer.c. For the Rx and completion
- * ring, the kernel is the producer and user space is the consumer. For
- * the Tx and fill rings, the kernel is the consumer and user space is
- * the producer.
+/* The structure of the shared state of the rings is a simple
+ * circular buffer, as outlined in
+ * Documentation/core-api/circular-buffers.rst. For the Rx and
+ * completion ring, the kernel is the producer and user space is the
+ * consumer. For the Tx and fill rings, the kernel is the consumer and
+ * user space is the producer.
*
* producer consumer
*
- * if (LOAD ->consumer) { LOAD ->producer
- * (A) smp_rmb() (C)
+ * if (LOAD ->consumer) { (A) LOAD.acq ->producer (C)
* STORE $data LOAD $data
- * smp_wmb() (B) smp_mb() (D)
- * STORE ->producer STORE ->consumer
+ * STORE.rel ->producer (B) STORE.rel ->consumer (D)
* }
*
* (A) pairs with (D), and (B) pairs with (C).
@@ -78,7 +77,8 @@ struct xsk_queue {
*
* (A) is a control dependency that separates the load of ->consumer
* from the stores of $data. In case ->consumer indicates there is no
- * room in the buffer to store $data we do not. So no barrier is needed.
+ * room in the buffer to store $data we do not. The dependency will
+ * order both of the stores after the loads. So no barrier is needed.
*
* (D) protects the load of the data to be observed to happen after the
* store of the consumer pointer. If we did not have this memory
@@ -227,15 +227,13 @@ static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q,
static inline void __xskq_cons_release(struct xsk_queue *q)
{
- smp_mb(); /* D, matches A */
- WRITE_ONCE(q->ring->consumer, q->cached_cons);
+ smp_store_release(&q->ring->consumer, q->cached_cons); /* D, matches A */
}
static inline void __xskq_cons_peek(struct xsk_queue *q)
{
/* Refresh the local pointer */
- q->cached_prod = READ_ONCE(q->ring->producer);
- smp_rmb(); /* C, matches B */
+ q->cached_prod = smp_load_acquire(&q->ring->producer); /* C, matches B */
}
static inline void xskq_cons_get_entries(struct xsk_queue *q)
@@ -397,9 +395,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx)
{
- smp_wmb(); /* B, matches C */
-
- WRITE_ONCE(q->ring->producer, idx);
+ smp_store_release(&q->ring->producer, idx); /* B, matches C */
}
static inline void xskq_prod_submit(struct xsk_queue *q)
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 113fd9017203..67b4ce504852 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -87,7 +87,6 @@ static void xsk_map_free(struct bpf_map *map)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
- bpf_clear_redirect_map(map);
synchronize_net();
bpf_map_area_free(m);
}
@@ -125,6 +124,16 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
return insn - insn_buf;
}
+static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ return READ_ONCE(m->xsk_map[key]);
+}
+
static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
{
WARN_ON_ONCE(!rcu_read_lock_held());
@@ -215,6 +224,11 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
+static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem);
+}
+
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
struct xdp_sock **map_entry)
{
@@ -247,4 +261,5 @@ const struct bpf_map_ops xsk_map_ops = {
.map_check_btf = map_check_no_btf,
.map_btf_name = "xsk_map",
.map_btf_id = &xsk_map_btf_id,
+ .map_redirect = xsk_map_redirect,
};