From 67cc570edaa02016a8685a06a0ee91f05a6277d9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 27 Aug 2020 19:28:42 +0200 Subject: netfilter: nf_tables: coalesce multiple notifications into one skbuff On x86_64, each notification results in one skbuff allocation which consumes at least 768 bytes due to the skbuff overhead. This patch coalesces several notifications into one single skbuff, so each notification consumes at least ~211 bytes, that ~3.5 times less memory consumption. As a result, this is reducing the chances to exhaust the netlink socket receive buffer. Rule of thumb is that each notification batch only contains netlink messages whose report flag is the same, nfnetlink_send() requires this to do appropriate delivery to userspace, either via unicast (echo mode) or multicast (monitor mode). The skbuff control buffer is used to annotate the report flag for later handling at the new coalescing routine. The batch skbuff notification size is NLMSG_GOODSIZE, using a larger skbuff would allow for more socket receiver buffer savings (to amortize the cost of the skbuff even more), however, going over that size might break userspace applications, so let's be conservative and stick to NLMSG_GOODSIZE. Reported-by: Phil Sutter Acked-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netns/nftables.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index a1a8d45adb42..6c0806bd8d1e 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -8,6 +8,7 @@ struct netns_nftables { struct list_head tables; struct list_head commit_list; struct list_head module_list; + struct list_head notify_list; struct mutex commit_mutex; unsigned int base_seq; u8 gencursor; -- cgit v1.2.3 From 553d87b658fed0e22a0f86b4f1b093c39d3e3074 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 10 Sep 2020 15:34:39 +0200 Subject: netlink: fix doc about nlmsg_parse/nla_validate There is no @validate argument. CC: Johannes Berg Fixes: 3de644035446 ("netlink: re-add parse/validate functions in strict mode") Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/netlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index c0411f14fb53..8e0eb2c9c528 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -726,7 +726,6 @@ static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected - * @validate: validation strictness * @extack: extended ACK report struct * * See nla_parse() @@ -824,7 +823,6 @@ static inline int nla_validate_deprecated(const struct nlattr *head, int len, * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy - * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the -- cgit v1.2.3 From 1869e226a7b3ef75b4f70ede2f1b7229f7157fa4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 13 Sep 2020 12:43:39 -0600 Subject: ipv4: Initialize flowi4_multipath_hash in data path flowi4_multipath_hash was added by the commit referenced below for tunnels. Unfortunately, the patch did not initialize the new field for several fast path lookups that do not initialize the entire flow struct to 0. Fix those locations. Currently, flowi4_multipath_hash is random garbage and affects the hash value computed by fib_multipath_hash for multipath selection. Fixes: 24ba14406c5c ("route: Add multipath_hash in flowi_common to make user-define hash") Signed-off-by: David Ahern Cc: wenxu Signed-off-by: David S. Miller --- include/net/flow.h | 1 + net/core/filter.c | 1 + net/ipv4/fib_frontend.c | 1 + net/ipv4/route.c | 1 + 4 files changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index 929d3ca614d0..b2531df3f65f 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -116,6 +116,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = sport; + fl4->flowi4_multipath_hash = 0; } /* Reset some input parameters after previous lookup */ diff --git a/net/core/filter.c b/net/core/filter.c index 1f647ab986b6..1b168371ba96 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4838,6 +4838,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl4.saddr = params->ipv4_src; fl4.fl4_sport = params->sport; fl4.fl4_dport = params->dport; + fl4.flowi4_multipath_hash = 0; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 41079490a118..86a23e4a6a50 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -362,6 +362,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); + fl4.flowi4_multipath_hash = 0; no_addr = idev->ifa_list == NULL; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8ca6bcab7b03..e5f210d00851 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2147,6 +2147,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); + fl4.flowi4_multipath_hash = 0; if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) { flkeys = &_flkeys; -- cgit v1.2.3 From 13e6ce98aa65ab5ce19351c020419360dfe8af29 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 13 Sep 2020 19:51:50 +0800 Subject: net: sched: only keep the available bits when setting vxlan md->gbp As we can see from vxlan_build/parse_gbp_hdr(), when processing metadata on vxlan rx/tx path, only dont_learn/policy_applied/policy_id fields can be set to or parse from the packet for vxlan gbp option. So we'd better do the mask when set it in act_tunnel_key and cls_flower. Otherwise, when users don't know these bits, they may configure with a value which can never be matched. Reported-by: Shuang Li Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/vxlan.h | 3 +++ net/sched/act_tunnel_key.c | 1 + net/sched/cls_flower.c | 4 +++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 3a41627cbdfe..08537aa14f7c 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -121,6 +121,9 @@ struct vxlanhdr_gbp { #define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16) #define VXLAN_GBP_ID_MASK (0xFFFF) +#define VXLAN_GBP_MASK (VXLAN_GBP_DONT_LEARN | VXLAN_GBP_POLICY_APPLIED | \ + VXLAN_GBP_ID_MASK) + /* * VXLAN Generic Protocol Extension (VXLAN_F_GPE): * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 536c4bc31be6..37f1e10f35e0 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -156,6 +156,7 @@ tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len, struct vxlan_metadata *md = dst; md->gbp = nla_get_u32(tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]); + md->gbp &= VXLAN_GBP_MASK; } return sizeof(struct vxlan_metadata); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index a4f7ef1de7e7..e8fda1bd4d9d 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1175,8 +1175,10 @@ static int fl_set_vxlan_opt(const struct nlattr *nla, struct fl_flow_key *key, return -EINVAL; } - if (tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) + if (tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) { md->gbp = nla_get_u32(tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]); + md->gbp &= VXLAN_GBP_MASK; + } return sizeof(*md); } -- cgit v1.2.3 From fe81d9f6182d1160e625894eecb3d7ff0222cac5 Mon Sep 17 00:00:00 2001 From: Henry Ptasinski Date: Sat, 19 Sep 2020 00:12:11 +0000 Subject: net: sctp: Fix IPv6 ancestor_size calc in sctp_copy_descendant When calculating ancestor_size with IPv6 enabled, simply using sizeof(struct ipv6_pinfo) doesn't account for extra bytes needed for alignment in the struct sctp6_sock. On x86, there aren't any extra bytes, but on ARM the ipv6_pinfo structure is aligned on an 8-byte boundary so there were 4 pad bytes that were omitted from the ancestor_size calculation. This would lead to corruption of the pd_lobby pointers, causing an oops when trying to free the sctp structure on socket close. Fixes: 636d25d557d1 ("sctp: not copy sctp_sock pd_lobby in sctp_copy_descendant") Signed-off-by: Henry Ptasinski Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 8 +++++--- net/sctp/socket.c | 9 +++------ 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index b33f1aefad09..0bdff38eb4bb 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -226,12 +226,14 @@ struct sctp_sock { data_ready_signalled:1; atomic_t pd_mode; + + /* Fields after this point will be skipped on copies, like on accept + * and peeloff operations + */ + /* Receive to here while partial delivery is in effect. */ struct sk_buff_head pd_lobby; - /* These must be the last fields, as they will skipped on copies, - * like on accept and peeloff operations - */ struct list_head auto_asconf_list; int do_auto_asconf; }; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 836615f71a7d..53d0a4161df3 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9220,13 +9220,10 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, static inline void sctp_copy_descendant(struct sock *sk_to, const struct sock *sk_from) { - int ancestor_size = sizeof(struct inet_sock) + - sizeof(struct sctp_sock) - - offsetof(struct sctp_sock, pd_lobby); - - if (sk_from->sk_family == PF_INET6) - ancestor_size += sizeof(struct ipv6_pinfo); + size_t ancestor_size = sizeof(struct inet_sock); + ancestor_size += sk_from->sk_prot->obj_size; + ancestor_size -= offsetof(struct sctp_sock, pd_lobby); __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size); } -- cgit v1.2.3