From 03891f820c2117b19e80b370281eb924a09cf79f Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Sun, 2 Feb 2020 13:19:34 +0200 Subject: xfrm: handle NETDEV_UNREGISTER for xfrm device This patch to handle the asynchronous unregister device event so the device IPsec offload resources could be cleanly released. Fixes: e4db5b61c572 ("xfrm: policy: remove pcpu policy cache") Signed-off-by: Raed Salem Reviewed-by: Boris Pismenny Reviewed-by: Saeed Mahameed Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 50f567a88f45..3231ec628544 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -383,6 +383,7 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void return xfrm_dev_feat_change(dev); case NETDEV_DOWN: + case NETDEV_UNREGISTER: return xfrm_dev_down(dev); } return NOTIFY_DONE; -- cgit v1.2.3 From f1ed10264ed6b66b9cd5e8461cffce69be482356 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 4 Feb 2020 17:00:27 +0100 Subject: vti[6]: fix packet tx through bpf_redirect() in XinY cases I forgot the 4in6/6in4 cases in my previous patch. Let's fix them. Fixes: 95224166a903 ("vti[6]: fix packet tx through bpf_redirect()") Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- net/ipv4/Kconfig | 1 + net/ipv4/ip_vti.c | 38 ++++++++++++++++++++++++++++++-------- net/ipv6/ip6_vti.c | 32 +++++++++++++++++++++++++------- 3 files changed, 56 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index f96bd489b362..6490b845e17b 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -303,6 +303,7 @@ config SYN_COOKIES config NET_IPVTI tristate "Virtual (secure) IP: tunneling" + depends on IPV6 || IPV6=n select INET_TUNNEL select NET_IP_TUNNEL select XFRM diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 37cddd18f282..1b4e6f298648 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -187,17 +187,39 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, int mtu; if (!dst) { - struct rtable *rt; - - fl->u.ip4.flowi4_oif = dev->ifindex; - fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; - rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); - if (IS_ERR(rt)) { + switch (skb->protocol) { + case htons(ETH_P_IP): { + struct rtable *rt; + + fl->u.ip4.flowi4_oif = dev->ifindex; + fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + dst = &rt->dst; + skb_dst_set(skb, dst); + break; + } +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + fl->u.ip6.flowi6_oif = dev->ifindex; + fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); + if (dst->error) { + dst_release(dst); + dst = NULL; + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + skb_dst_set(skb, dst); + break; +#endif + default: dev->stats.tx_carrier_errors++; goto tx_error_icmp; } - dst = &rt->dst; - skb_dst_set(skb, dst); } dst_hold(dst); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 524006aa0d78..56e642efefff 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -450,15 +450,33 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) int mtu; if (!dst) { - fl->u.ip6.flowi6_oif = dev->ifindex; - fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; - dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); - if (dst->error) { - dst_release(dst); - dst = NULL; + switch (skb->protocol) { + case htons(ETH_P_IP): { + struct rtable *rt; + + fl->u.ip4.flowi4_oif = dev->ifindex; + fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); + if (IS_ERR(rt)) + goto tx_err_link_failure; + dst = &rt->dst; + skb_dst_set(skb, dst); + break; + } + case htons(ETH_P_IPV6): + fl->u.ip6.flowi6_oif = dev->ifindex; + fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); + if (dst->error) { + dst_release(dst); + dst = NULL; + goto tx_err_link_failure; + } + skb_dst_set(skb, dst); + break; + default: goto tx_err_link_failure; } - skb_dst_set(skb, dst); } dst_hold(dst); -- cgit v1.2.3 From 171d449a028573b2f0acdc7f31ecbb045391b320 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 9 Feb 2020 21:15:29 +0800 Subject: xfrm: fix uctx len check in verify_sec_ctx_len It's not sufficient to do 'uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len)' check only, as uctx->len may be greater than nla_len(rt), in which case it will cause slab-out-of-bounds when accessing uctx->ctx_str later. This patch is to fix it by return -EINVAL when uctx->len > nla_len(rt). Fixes: df71837d5024 ("[LSM-IPSec]: Security association restriction.") Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index b88ba45ff1ac..38ff02d31402 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -110,7 +110,8 @@ static inline int verify_sec_ctx_len(struct nlattr **attrs) return 0; uctx = nla_data(rt); - if (uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len)) + if (uctx->len > nla_len(rt) || + uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len)) return -EINVAL; return 0; -- cgit v1.2.3 From a1a7e3a36e01ca6e67014f8cf673cb8e47be5550 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 9 Feb 2020 21:16:38 +0800 Subject: xfrm: add the missing verify_sec_ctx_len check in xfrm_add_acquire Without doing verify_sec_ctx_len() check in xfrm_add_acquire(), it may be out-of-bounds to access uctx->ctx_str with uctx->ctx_len, as noticed by syz: BUG: KASAN: slab-out-of-bounds in selinux_xfrm_alloc_user+0x237/0x430 Read of size 768 at addr ffff8880123be9b4 by task syz-executor.1/11650 Call Trace: dump_stack+0xe8/0x16e print_address_description.cold.3+0x9/0x23b kasan_report.cold.4+0x64/0x95 memcpy+0x1f/0x50 selinux_xfrm_alloc_user+0x237/0x430 security_xfrm_policy_alloc+0x5c/0xb0 xfrm_policy_construct+0x2b1/0x650 xfrm_add_acquire+0x21d/0xa10 xfrm_user_rcv_msg+0x431/0x6f0 netlink_rcv_skb+0x15a/0x410 xfrm_netlink_rcv+0x6d/0x90 netlink_unicast+0x50e/0x6a0 netlink_sendmsg+0x8ae/0xd40 sock_sendmsg+0x133/0x170 ___sys_sendmsg+0x834/0x9a0 __sys_sendmsg+0x100/0x1e0 do_syscall_64+0xe5/0x660 entry_SYSCALL_64_after_hwframe+0x6a/0xdf So fix it by adding the missing verify_sec_ctx_len check there. Fixes: 980ebd25794f ("[IPSEC]: Sync series - acquire insert") Reported-by: Hangbin Liu Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 38ff02d31402..e6cfaa680ef3 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2274,6 +2274,9 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, xfrm_mark_get(attrs, &mark); err = verify_newpolicy_info(&ua->policy); + if (err) + goto free_state; + err = verify_sec_ctx_len(attrs); if (err) goto free_state; -- cgit v1.2.3 From 8e8ce08198de193e3d21d42e96945216e3d9ac7f Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 16 Feb 2020 13:02:06 +0100 Subject: batman-adv: Don't schedule OGM for disabled interface A transmission scheduling for an interface which is currently dropped by batadv_iv_ogm_iface_disable could still be in progress. The B.A.T.M.A.N. V is simply cancelling the workqueue item in an synchronous way but this is not possible with B.A.T.M.A.N. IV because the OGM submissions are intertwined. Instead it has to stop submitting the OGM when it detect that the buffer pointer is set to NULL. Reported-by: syzbot+a98f2016f40b9cd3818a@syzkaller.appspotmail.com Reported-by: syzbot+ac36b6a33c28a491e929@syzkaller.appspotmail.com Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Signed-off-by: Sven Eckelmann Cc: Hillf Danton Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f0209505e41a..a7c8dd7ae513 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -789,6 +789,10 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex); + /* interface already disabled by batadv_iv_ogm_iface_disable */ + if (!*ogm_buff) + return; + /* the interface gets activated here to avoid race conditions between * the moment of activating the interface in * hardif_activate_interface() where the originator mac is set and -- cgit v1.2.3 From edf0d283d988c0c7da53fa2cc1388649263470de Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Fri, 21 Feb 2020 21:54:47 +0530 Subject: ipv6: xfrm6_tunnel.c: Use built-in RCU list checking hlist_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index e11bdb0aaa15..25b7ebda2fab 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -78,7 +78,7 @@ static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const hlist_for_each_entry_rcu(x6spi, &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], - list_byaddr) { + list_byaddr, lockdep_is_held(&xfrm6_tunnel_spi_lock)) { if (xfrm6_addr_equal(&x6spi->addr, saddr)) return x6spi; } -- cgit v1.2.3 From 84b3268027641401bb8ad4427a90a3cce2eb86f5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 26 Feb 2020 19:47:34 +0100 Subject: netlink: Use netlink header as base to calculate bad attribute offset Userspace might send a batch that is composed of several netlink messages. The netlink_ack() function must use the pointer to the netlink header as base to calculate the bad attribute offset. Fixes: 2d4bc93368f5 ("netlink: extended ACK reporting") Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index edf3e285e242..5313f1cec170 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2434,7 +2434,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, in_skb->len)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, (u8 *)extack->bad_attr - - in_skb->data)); + (u8 *)nlh)); } else { if (extack->cookie_len) WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, -- cgit v1.2.3 From 07758eb9ff52794fba15d03aa88d92dbd1b7d125 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sat, 29 Feb 2020 17:27:13 +0800 Subject: net/ipv6: use configured metric when add peer route When we add peer address with metric configured, IPv4 could set the dest metric correctly, but IPv6 do not. e.g. ]# ip addr add 192.0.2.1 peer 192.0.2.2/32 dev eth1 metric 20 ]# ip route show dev eth1 192.0.2.2 proto kernel scope link src 192.0.2.1 metric 20 ]# ip addr add 2001:db8::1 peer 2001:db8::2/128 dev eth1 metric 20 ]# ip -6 route show dev eth1 2001:db8::1 proto kernel metric 20 pref medium 2001:db8::2 proto kernel metric 256 pref medium Fix this by using configured metric instead of default one. Reported-by: Jianlin Shi Fixes: 8308f3ff1753 ("net/ipv6: Add support for specifying metric of connected routes") Reviewed-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index cb493e15959c..164c71c54b5c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5983,9 +5983,9 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); if (!ipv6_addr_any(&ifp->peer_addr)) - addrconf_prefix_route(&ifp->peer_addr, 128, 0, - ifp->idev->dev, 0, 0, - GFP_ATOMIC); + addrconf_prefix_route(&ifp->peer_addr, 128, + ifp->rt_priority, ifp->idev->dev, + 0, 0, GFP_ATOMIC); break; case RTM_DELADDR: if (ifp->idev->cnf.forwarding) -- cgit v1.2.3 From 8750939b6ad86abc3f53ec8a9683a1cded4a5654 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:11 -0800 Subject: devlink: validate length of param values DEVLINK_ATTR_PARAM_VALUE_DATA may have different types so it's not checked by the normal netlink policy. Make sure the attribute length is what we expect. Fixes: e3b7ca18ad7b ("devlink: Add param set command") Signed-off-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 5e220809844c..8e44dc5cde73 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3352,34 +3352,41 @@ devlink_param_value_get_from_info(const struct devlink_param *param, struct genl_info *info, union devlink_param_value *value) { + struct nlattr *param_data; int len; - if (param->type != DEVLINK_PARAM_TYPE_BOOL && - !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) + param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]; + + if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data) return -EINVAL; switch (param->type) { case DEVLINK_PARAM_TYPE_U8: - value->vu8 = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + if (nla_len(param_data) != sizeof(u8)) + return -EINVAL; + value->vu8 = nla_get_u8(param_data); break; case DEVLINK_PARAM_TYPE_U16: - value->vu16 = nla_get_u16(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + if (nla_len(param_data) != sizeof(u16)) + return -EINVAL; + value->vu16 = nla_get_u16(param_data); break; case DEVLINK_PARAM_TYPE_U32: - value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + if (nla_len(param_data) != sizeof(u32)) + return -EINVAL; + value->vu32 = nla_get_u32(param_data); break; case DEVLINK_PARAM_TYPE_STRING: - len = strnlen(nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]), - nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])); - if (len == nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) || + len = strnlen(nla_data(param_data), nla_len(param_data)); + if (len == nla_len(param_data) || len >= __DEVLINK_PARAM_MAX_STRING_VALUE) return -EINVAL; - strcpy(value->vstr, - nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])); + strcpy(value->vstr, nla_data(param_data)); break; case DEVLINK_PARAM_TYPE_BOOL: - value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ? - true : false; + if (param_data && nla_len(param_data)) + return -EINVAL; + value->vbool = nla_get_flag(param_data); break; } return 0; -- cgit v1.2.3 From ff3b63b8c299b73ac599b120653b47e275407656 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:12 -0800 Subject: devlink: validate length of region addr/len DEVLINK_ATTR_REGION_CHUNK_ADDR and DEVLINK_ATTR_REGION_CHUNK_LEN lack entries in the netlink policy. Corresponding nla_get_u64()s may read beyond the end of the message. Fixes: 4e54795a27f5 ("devlink: Add support for region snapshot read command") Signed-off-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 8e44dc5cde73..b831c5545d6a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -5958,6 +5958,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64 }, + [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64 }, [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, -- cgit v1.2.3 From 9322cd7c4af2ccc7fe7c5f01adb53f4f77949e92 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:14 -0800 Subject: nl802154: add missing attribute validation Add missing attribute validation for several u8 types. Fixes: 2c21d11518b6 ("net: add NL802154 interface for configuration of 802.15.4 devices") Signed-off-by: Jakub Kicinski Acked-by: Stefan Schmidt Signed-off-by: David S. Miller --- net/ieee802154/nl_policy.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c index 2c7a38d76a3a..824e7e84014c 100644 --- a/net/ieee802154/nl_policy.c +++ b/net/ieee802154/nl_policy.c @@ -21,6 +21,11 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = { [IEEE802154_ATTR_HW_ADDR] = { .type = NLA_HW_ADDR, }, [IEEE802154_ATTR_PAN_ID] = { .type = NLA_U16, }, [IEEE802154_ATTR_CHANNEL] = { .type = NLA_U8, }, + [IEEE802154_ATTR_BCN_ORD] = { .type = NLA_U8, }, + [IEEE802154_ATTR_SF_ORD] = { .type = NLA_U8, }, + [IEEE802154_ATTR_PAN_COORD] = { .type = NLA_U8, }, + [IEEE802154_ATTR_BAT_EXT] = { .type = NLA_U8, }, + [IEEE802154_ATTR_COORD_REALIGN] = { .type = NLA_U8, }, [IEEE802154_ATTR_PAGE] = { .type = NLA_U8, }, [IEEE802154_ATTR_COORD_SHORT_ADDR] = { .type = NLA_U16, }, [IEEE802154_ATTR_COORD_HW_ADDR] = { .type = NLA_HW_ADDR, }, -- cgit v1.2.3 From b60673c4c418bef7550d02faf53c34fbfeb366bf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:15 -0800 Subject: nl802154: add missing attribute validation for dev_type Add missing attribute type validation for IEEE802154_ATTR_DEV_TYPE to the netlink policy. Fixes: 90c049b2c6ae ("ieee802154: interface type to be added") Signed-off-by: Jakub Kicinski Acked-by: Stefan Schmidt Signed-off-by: David S. Miller --- net/ieee802154/nl_policy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c index 824e7e84014c..0672b2f01586 100644 --- a/net/ieee802154/nl_policy.c +++ b/net/ieee802154/nl_policy.c @@ -27,6 +27,7 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = { [IEEE802154_ATTR_BAT_EXT] = { .type = NLA_U8, }, [IEEE802154_ATTR_COORD_REALIGN] = { .type = NLA_U8, }, [IEEE802154_ATTR_PAGE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_DEV_TYPE] = { .type = NLA_U8, }, [IEEE802154_ATTR_COORD_SHORT_ADDR] = { .type = NLA_U16, }, [IEEE802154_ATTR_COORD_HW_ADDR] = { .type = NLA_HW_ADDR, }, [IEEE802154_ATTR_COORD_PAN_ID] = { .type = NLA_U16, }, -- cgit v1.2.3 From b5ab1f1be6180a2e975eede18731804b5164a05d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:18 -0800 Subject: openvswitch: add missing attribute validation for hash Add missing attribute validation for OVS_PACKET_ATTR_HASH to the netlink policy. Fixes: bd1903b7c459 ("net: openvswitch: add hash info to upcall") Signed-off-by: Jakub Kicinski Reviewed-by: Greg Rose Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index c047afd12116..07a7dd185995 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -645,6 +645,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, + [OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 }, }; static const struct genl_ops dp_packet_genl_ops[] = { -- cgit v1.2.3 From 7e6dc03eeb023e18427a373522f1d247b916a641 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:19 -0800 Subject: net: fq: add missing attribute validation for orphan mask Add missing attribute validation for TCA_FQ_ORPHAN_MASK to the netlink policy. Fixes: 06eb395fa985 ("pkt_sched: fq: better control of DDOS traffic") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index a5a295477ecc..371ad84def3b 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -744,6 +744,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, + [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 }, [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, }; -- cgit v1.2.3 From e13aaa0643da10006ec35715954e7f92a62899a5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:20 -0800 Subject: net: taprio: add missing attribute validation for txtime delay Add missing attribute validation for TCA_TAPRIO_ATTR_TXTIME_DELAY to the netlink policy. Fixes: 4cfd5779bd6e ("taprio: Add support for txtime-assist mode") Signed-off-by: Jakub Kicinski Reviewed-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 660fc45ee40f..ee717e05372b 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -768,6 +768,7 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 }, + [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, }; static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry, -- cgit v1.2.3 From 213320a67962ff6e7b83b704d55cbebc341426db Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:23 -0800 Subject: tipc: add missing attribute validation for MTU property Add missing attribute validation for TIPC_NLA_PROP_MTU to the netlink policy. Fixes: 901271e0403a ("tipc: implement configuration of UDP media MTU") Signed-off-by: Jakub Kicinski Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/tipc/netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 7c35094c20b8..bb9862410e68 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -116,6 +116,7 @@ const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 }, [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 }, [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 }, + [TIPC_NLA_PROP_MTU] = { .type = NLA_U32 }, [TIPC_NLA_PROP_BROADCAST] = { .type = NLA_U32 }, [TIPC_NLA_PROP_BROADCAST_RATIO] = { .type = NLA_U32 } }; -- cgit v1.2.3 From 361d23e41ca6e504033f7e66a03b95788377caae Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:24 -0800 Subject: nfc: add missing attribute validation for SE API Add missing attribute validation for NFC_ATTR_SE_INDEX to the netlink policy. Fixes: 5ce3f32b5264 ("NFC: netlink: SE API implementation") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/nfc/netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index eee0dddb7749..842407a48f96 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -43,6 +43,7 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED }, [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING, .len = NFC_FIRMWARE_NAME_MAXSIZE }, + [NFC_ATTR_SE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY }, [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, -- cgit v1.2.3 From 88e706d5168b07df4792dbc3d1bc37b83e4bd74d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:25 -0800 Subject: nfc: add missing attribute validation for deactivate target Add missing attribute validation for NFC_ATTR_TARGET_INDEX to the netlink policy. Fixes: 4d63adfe12dd ("NFC: Add NFC_CMD_DEACTIVATE_TARGET support") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/nfc/netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 842407a48f96..e988ca486d66 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -32,6 +32,7 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING, .len = NFC_DEVICE_NAME_MAXSIZE }, [NFC_ATTR_PROTOCOLS] = { .type = NLA_U32 }, + [NFC_ATTR_TARGET_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_COMM_MODE] = { .type = NLA_U8 }, [NFC_ATTR_RF_MODE] = { .type = NLA_U8 }, [NFC_ATTR_DEVICE_POWERED] = { .type = NLA_U8 }, -- cgit v1.2.3 From 6ba3da446551f2150fadbf8c7788edcb977683d3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:26 -0800 Subject: nfc: add missing attribute validation for vendor subcommand Add missing attribute validation for vendor subcommand attributes to the netlink policy. Fixes: 9e58095f9660 ("NFC: netlink: Implement vendor command support") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/nfc/netlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index e988ca486d66..e894254c17d4 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -46,6 +46,8 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { .len = NFC_FIRMWARE_NAME_MAXSIZE }, [NFC_ATTR_SE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY }, + [NFC_ATTR_VENDOR_ID] = { .type = NLA_U32 }, + [NFC_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, }; -- cgit v1.2.3 From 617940123e0140521f3080d2befc2bf55bcda094 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 3 Mar 2020 14:37:34 +0800 Subject: net/ipv6: need update peer route when modify metric When we modify the route metric, the peer address's route need also be updated. Before the fix: + ip addr add dev dummy1 2001:db8::1 peer 2001:db8::2 metric 60 + ip -6 route show dev dummy1 2001:db8::1 proto kernel metric 60 pref medium 2001:db8::2 proto kernel metric 60 pref medium + ip addr change dev dummy1 2001:db8::1 peer 2001:db8::2 metric 61 + ip -6 route show dev dummy1 2001:db8::1 proto kernel metric 61 pref medium 2001:db8::2 proto kernel metric 60 pref medium After the fix: + ip addr change dev dummy1 2001:db8::1 peer 2001:db8::2 metric 61 + ip -6 route show dev dummy1 2001:db8::1 proto kernel metric 61 pref medium 2001:db8::2 proto kernel metric 61 pref medium Fixes: 8308f3ff1753 ("net/ipv6: Add support for specifying metric of connected routes") Signed-off-by: Hangbin Liu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 164c71c54b5c..4fb72028ca45 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4586,12 +4586,14 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, } static int modify_prefix_route(struct inet6_ifaddr *ifp, - unsigned long expires, u32 flags) + unsigned long expires, u32 flags, + bool modify_peer) { struct fib6_info *f6i; u32 prio; - f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, + f6i = addrconf_get_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr, + ifp->prefix_len, ifp->idev->dev, 0, RTF_DEFAULT, true); if (!f6i) return -ENOENT; @@ -4602,7 +4604,8 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, ip6_del_rt(dev_net(ifp->idev->dev), f6i); /* add new one */ - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, + addrconf_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr, + ifp->prefix_len, ifp->rt_priority, ifp->idev->dev, expires, flags, GFP_KERNEL); } else { @@ -4678,7 +4681,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) int rc = -ENOENT; if (had_prefixroute) - rc = modify_prefix_route(ifp, expires, flags); + rc = modify_prefix_route(ifp, expires, flags, false); /* prefix route could have been deleted; if so restore it */ if (rc == -ENOENT) { @@ -4686,6 +4689,15 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) ifp->rt_priority, ifp->idev->dev, expires, flags, GFP_KERNEL); } + + if (had_prefixroute && !ipv6_addr_any(&ifp->peer_addr)) + rc = modify_prefix_route(ifp, expires, flags, true); + + if (rc == -ENOENT && !ipv6_addr_any(&ifp->peer_addr)) { + addrconf_prefix_route(&ifp->peer_addr, ifp->prefix_len, + ifp->rt_priority, ifp->idev->dev, + expires, flags, GFP_KERNEL); + } } else if (had_prefixroute) { enum cleanup_prefix_rt_t action; unsigned long rt_expires; -- cgit v1.2.3 From d0098e4c6b83e502cc1cd96d67ca86bc79a6c559 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 3 Mar 2020 14:37:35 +0800 Subject: net/ipv6: remove the old peer route if change it to a new one When we modify the peer route and changed it to a new one, we should remove the old route first. Before the fix: + ip addr add dev dummy1 2001:db8::1 peer 2001:db8::2 + ip -6 route show dev dummy1 2001:db8::1 proto kernel metric 256 pref medium 2001:db8::2 proto kernel metric 256 pref medium + ip addr change dev dummy1 2001:db8::1 peer 2001:db8::3 + ip -6 route show dev dummy1 2001:db8::1 proto kernel metric 256 pref medium 2001:db8::2 proto kernel metric 256 pref medium After the fix: + ip addr change dev dummy1 2001:db8::1 peer 2001:db8::3 + ip -6 route show dev dummy1 2001:db8::1 proto kernel metric 256 pref medium 2001:db8::3 proto kernel metric 256 pref medium This patch depend on the previous patch "net/ipv6: need update peer route when modify metric" to update new peer route after delete old one. Signed-off-by: Hangbin Liu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4fb72028ca45..e6e1290ea06f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1226,11 +1226,13 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires) } static void -cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt) +cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, + bool del_rt, bool del_peer) { struct fib6_info *f6i; - f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, + f6i = addrconf_get_prefix_route(del_peer ? &ifp->peer_addr : &ifp->addr, + ifp->prefix_len, ifp->idev->dev, 0, RTF_DEFAULT, true); if (f6i) { if (del_rt) @@ -1293,7 +1295,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) if (action != CLEANUP_PREFIX_RT_NOP) { cleanup_prefix_route(ifp, expires, - action == CLEANUP_PREFIX_RT_DEL); + action == CLEANUP_PREFIX_RT_DEL, false); } /* clean up prefsrc entries */ @@ -4627,6 +4629,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) unsigned long timeout; bool was_managetempaddr; bool had_prefixroute; + bool new_peer = false; ASSERT_RTNL(); @@ -4658,6 +4661,13 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) cfg->preferred_lft = timeout; } + if (cfg->peer_pfx && + memcmp(&ifp->peer_addr, cfg->peer_pfx, sizeof(struct in6_addr))) { + if (!ipv6_addr_any(&ifp->peer_addr)) + cleanup_prefix_route(ifp, expires, true, true); + new_peer = true; + } + spin_lock_bh(&ifp->lock); was_managetempaddr = ifp->flags & IFA_F_MANAGETEMPADDR; had_prefixroute = ifp->flags & IFA_F_PERMANENT && @@ -4673,6 +4683,9 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority) ifp->rt_priority = cfg->rt_priority; + if (new_peer) + ifp->peer_addr = *cfg->peer_pfx; + spin_unlock_bh(&ifp->lock); if (!(ifp->flags&IFA_F_TENTATIVE)) ipv6_ifa_notify(0, ifp); @@ -4708,7 +4721,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) if (action != CLEANUP_PREFIX_RT_NOP) { cleanup_prefix_route(ifp, rt_expires, - action == CLEANUP_PREFIX_RT_DEL); + action == CLEANUP_PREFIX_RT_DEL, false); } } -- cgit v1.2.3 From 8640f8dc6d657ebfb4e67c202ad32c5457858a13 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 3 Mar 2020 15:01:46 +0000 Subject: net: dsa: fix phylink_start()/phylink_stop() calls Place phylink_start()/phylink_stop() inside dsa_port_enable() and dsa_port_disable(), which ensures that we call phylink_stop() before tearing down phylink - which is a documented requirement. Failure to do so can cause use-after-free bugs. Fixes: 0e27921816ad ("net: dsa: Use PHYLINK for the CPU/DSA ports") Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 2 ++ net/dsa/port.c | 32 ++++++++++++++++++++++++++------ net/dsa/slave.c | 8 ++------ 3 files changed, 30 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index a7662e7a691d..760e6ea3178a 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -117,7 +117,9 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, /* port.c */ int dsa_port_set_state(struct dsa_port *dp, u8 state, struct switchdev_trans *trans); +int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy); int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy); +void dsa_port_disable_rt(struct dsa_port *dp); void dsa_port_disable(struct dsa_port *dp); int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); diff --git a/net/dsa/port.c b/net/dsa/port.c index 774facb8d547..ed7dabb57985 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -63,7 +63,7 @@ static void dsa_port_set_state_now(struct dsa_port *dp, u8 state) pr_err("DSA: failed to set STP state %u (%d)\n", state, err); } -int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy) +int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy) { struct dsa_switch *ds = dp->ds; int port = dp->index; @@ -78,14 +78,31 @@ int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy) if (!dp->bridge_dev) dsa_port_set_state_now(dp, BR_STATE_FORWARDING); + if (dp->pl) + phylink_start(dp->pl); + return 0; } -void dsa_port_disable(struct dsa_port *dp) +int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy) +{ + int err; + + rtnl_lock(); + err = dsa_port_enable_rt(dp, phy); + rtnl_unlock(); + + return err; +} + +void dsa_port_disable_rt(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; int port = dp->index; + if (dp->pl) + phylink_stop(dp->pl); + if (!dp->bridge_dev) dsa_port_set_state_now(dp, BR_STATE_DISABLED); @@ -93,6 +110,13 @@ void dsa_port_disable(struct dsa_port *dp) ds->ops->port_disable(ds, port); } +void dsa_port_disable(struct dsa_port *dp) +{ + rtnl_lock(); + dsa_port_disable_rt(dp); + rtnl_unlock(); +} + int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { @@ -614,10 +638,6 @@ static int dsa_port_phylink_register(struct dsa_port *dp) goto err_phy_connect; } - rtnl_lock(); - phylink_start(dp->pl); - rtnl_unlock(); - return 0; err_phy_connect: diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 088c886e609e..ddc0f9236928 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -88,12 +88,10 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } - err = dsa_port_enable(dp, dev->phydev); + err = dsa_port_enable_rt(dp, dev->phydev); if (err) goto clear_promisc; - phylink_start(dp->pl); - return 0; clear_promisc: @@ -114,9 +112,7 @@ static int dsa_slave_close(struct net_device *dev) struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); - phylink_stop(dp->pl); - - dsa_port_disable(dp); + dsa_port_disable_rt(dp); dev_mc_unsync(master, dev); dev_uc_unsync(master, dev); -- cgit v1.2.3 From dc15af8e9dbd039ebb06336597d2c491ef46ab74 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 25 Feb 2020 10:05:47 +0300 Subject: netfilter: nf_conntrack: ct_cpu_seq_next should increase position index If .next function does not change position index, following .show function will repeat output related to current position index. Cc: stable@vger.kernel.org Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code ...") Link: https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_standalone.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 410809c669e1..4912069627b6 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -411,7 +411,7 @@ static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu + 1; return per_cpu_ptr(net->ct.stat, cpu); } - + (*pos)++; return NULL; } -- cgit v1.2.3 From bb71f846a0002239f7058c84f1496648ff4a5c20 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 25 Feb 2020 10:05:59 +0300 Subject: netfilter: synproxy: synproxy_cpu_seq_next should increase position index If .next function does not change position index, following .show function will repeat output related to current position index. Cc: stable@vger.kernel.org Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code ...") Link: https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_synproxy_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index b0930d4aba22..b9cbe1e2453e 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -267,7 +267,7 @@ static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu + 1; return per_cpu_ptr(snet->stats, cpu); } - + (*pos)++; return NULL; } -- cgit v1.2.3 From db25517a550926f609c63054b12ea9ad515e1a10 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 25 Feb 2020 10:06:29 +0300 Subject: netfilter: xt_recent: recent_seq_next should increase position index If .next function does not change position index, following .show function will repeat output related to current position index. Without the patch: # dd if=/proc/net/xt_recent/SSH # original file outpt src=127.0.0.4 ttl: 0 last_seen: 6275444819 oldest_pkt: 1 6275444819 src=127.0.0.2 ttl: 0 last_seen: 6275438906 oldest_pkt: 1 6275438906 src=127.0.0.3 ttl: 0 last_seen: 6275441953 oldest_pkt: 1 6275441953 0+1 records in 0+1 records out 204 bytes copied, 6.1332e-05 s, 3.3 MB/s Read after lseek into middle of last line (offset 140 in example below) generates expected end of last line and then unexpected whole last line once again # dd if=/proc/net/xt_recent/SSH bs=140 skip=1 dd: /proc/net/xt_recent/SSH: cannot skip to specified offset 127.0.0.3 ttl: 0 last_seen: 6275441953 oldest_pkt: 1 6275441953 src=127.0.0.3 ttl: 0 last_seen: 6275441953 oldest_pkt: 1 6275441953 0+1 records in 0+1 records out 132 bytes copied, 6.2487e-05 s, 2.1 MB/s Cc: stable@vger.kernel.org Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code ...") Link: https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_recent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 0a9708004e20..225a7ab6d79a 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -492,12 +492,12 @@ static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) const struct recent_entry *e = v; const struct list_head *head = e->list.next; + (*pos)++; while (head == &t->iphash[st->bucket]) { if (++st->bucket >= ip_list_hash_size) return NULL; head = t->iphash[st->bucket].next; } - (*pos)++; return list_entry(head, struct recent_entry, list); } -- cgit v1.2.3 From ee84f19cbbe9cf7cba2958acb03163fed3ecbb0f Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 25 Feb 2020 10:07:12 +0300 Subject: netfilter: x_tables: xt_mttg_seq_next should increase position index If .next function does not change position index, following .show function will repeat output related to current position index. Without patch: # dd if=/proc/net/ip_tables_matches # original file output conntrack conntrack conntrack recent recent icmp udplite udp tcp 0+1 records in 0+1 records out 65 bytes copied, 5.4074e-05 s, 1.2 MB/s # dd if=/proc/net/ip_tables_matches bs=62 skip=1 dd: /proc/net/ip_tables_matches: cannot skip to specified offset cp <<< end of last line tcp <<< and then unexpected whole last line once again 0+1 records in 0+1 records out 7 bytes copied, 0.000102447 s, 68.3 kB/s Cc: stable@vger.kernel.org Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code ...") Link: https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index e27c6c5ba9df..cd2b034eef59 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1551,6 +1551,9 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; + if (ppos != NULL) + ++(*ppos); + switch (trav->class) { case MTTG_TRAV_INIT: trav->class = MTTG_TRAV_NFP_UNSPEC; @@ -1576,9 +1579,6 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, default: return NULL; } - - if (ppos != NULL) - ++*ppos; return trav; } -- cgit v1.2.3 From 2d285f26ecd072800a29c5b71e63437f21ef830a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 2 Mar 2020 21:58:50 +0100 Subject: netfilter: nf_tables: free flowtable hooks on hook register error If hook registration fails, the hooks allocated via nft_netdev_hook_alloc need to be freed. We can't change the goto label to 'goto 5' -- while it does fix the memleak it does cause a warning splat from the netfilter core (the hooks were not registered). Fixes: 3f0465a9ef02 ("netfilter: nf_tables: dynamically allocate hooks per net_device in flowtables") Reported-by: syzbot+a2ff6fa45162a5ed4dd3@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d1318bdf49ca..bb064aa4154b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6300,8 +6300,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, goto err4; err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); - if (err < 0) + if (err < 0) { + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } goto err4; + } err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) -- cgit v1.2.3 From c049b3450072b8e3998053490e025839fecfef31 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:08:31 -0800 Subject: netfilter: cthelper: add missing attribute validation for cthelper Add missing attribute validation for cthelper to the netlink policy. Fixes: 12f7a505331e ("netfilter: add user-space connection tracking helper infrastructure") Signed-off-by: Jakub Kicinski Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cthelper.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index de3a9596b7f1..a5f294aa8e4c 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -742,6 +742,8 @@ static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { [NFCTH_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN-1 }, [NFCTH_QUEUE_NUM] = { .type = NLA_U32, }, + [NFCTH_PRIV_DATA_LEN] = { .type = NLA_U32, }, + [NFCTH_STATUS] = { .type = NLA_U32, }, }; static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = { -- cgit v1.2.3 From 9d6effb2f1523eb84516e44213c00f2fd9e6afff Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:08:32 -0800 Subject: netfilter: nft_payload: add missing attribute validation for payload csum flags Add missing attribute validation for NFTA_PAYLOAD_CSUM_FLAGS to the netlink policy. Fixes: 1814096980bb ("netfilter: nft_payload: layer 4 checksum adjustment for pseudoheader fields") Signed-off-by: Jakub Kicinski Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 1993af3a2979..a7de3a58f553 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -129,6 +129,7 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 }, [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, [NFTA_PAYLOAD_CSUM_OFFSET] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 }, }; static int nft_payload_init(const struct nft_ctx *ctx, -- cgit v1.2.3 From 88a637719a1570705c02cacb3297af164b1714e7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:08:33 -0800 Subject: netfilter: nft_tunnel: add missing attribute validation for tunnels Add missing attribute validation for tunnel source and destination ports to the netlink policy. Fixes: af308b94a2a4 ("netfilter: nf_tables: add tunnel support") Signed-off-by: Jakub Kicinski Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tunnel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 4c3f2e24c7cb..764e88682a81 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -339,6 +339,8 @@ static const struct nla_policy nft_tunnel_key_policy[NFTA_TUNNEL_KEY_MAX + 1] = [NFTA_TUNNEL_KEY_FLAGS] = { .type = NLA_U32, }, [NFTA_TUNNEL_KEY_TOS] = { .type = NLA_U8, }, [NFTA_TUNNEL_KEY_TTL] = { .type = NLA_U8, }, + [NFTA_TUNNEL_KEY_SPORT] = { .type = NLA_U16, }, + [NFTA_TUNNEL_KEY_DPORT] = { .type = NLA_U16, }, [NFTA_TUNNEL_KEY_OPTS] = { .type = NLA_NESTED, }, }; -- cgit v1.2.3 From d1d17a359ce6901545c075d7401c10179d9cedfd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 4 Mar 2020 16:51:42 +0800 Subject: esp: remove the skb from the chain when it's enqueued in cryptd_wq Xiumei found a panic in esp offload: BUG: unable to handle kernel NULL pointer dereference at 0000000000000020 RIP: 0010:esp_output_done+0x101/0x160 [esp4] Call Trace: ? esp_output+0x180/0x180 [esp4] cryptd_aead_crypt+0x4c/0x90 cryptd_queue_worker+0x6e/0xa0 process_one_work+0x1a7/0x3b0 worker_thread+0x30/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x112/0x130 ? kthread_flush_work_fn+0x10/0x10 ret_from_fork+0x35/0x40 It was caused by that skb secpath is used in esp_output_done() after it's been released elsewhere. The tx path for esp offload is: __dev_queue_xmit()-> validate_xmit_skb_list()-> validate_xmit_xfrm()-> esp_xmit()-> esp_output_tail()-> aead_request_set_callback(esp_output_done) <--[1] crypto_aead_encrypt() <--[2] In [1], .callback is set, and in [2] it will trigger the worker schedule, later on a kernel thread will call .callback(esp_output_done), as the call trace shows. But in validate_xmit_xfrm(): skb_list_walk_safe(skb, skb2, nskb) { ... err = x->type_offload->xmit(x, skb2, esp_features); [esp_xmit] ... } When the err is -EINPROGRESS, which means this skb2 will be enqueued and later gets encrypted and sent out by .callback later in a kernel thread, skb2 should be removed fromt skb chain. Otherwise, it will get processed again outside validate_xmit_xfrm(), which could release skb secpath, and cause the panic above. This patch is to remove the skb from the chain when it's enqueued in cryptd_wq. While at it, remove the unnecessary 'if (!skb)' check. Fixes: 3dca3f38cfb8 ("xfrm: Separate ESP handling from segmentation for GRO packets.") Reported-by: Xiumei Mu Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 3231ec628544..e2db468cf50e 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -78,8 +78,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur int err; unsigned long flags; struct xfrm_state *x; - struct sk_buff *skb2, *nskb; struct softnet_data *sd; + struct sk_buff *skb2, *nskb, *pskb = NULL; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; @@ -168,14 +168,14 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur } else { if (skb == skb2) skb = nskb; - - if (!skb) - return NULL; + else + pskb->next = nskb; continue; } skb_push(skb2, skb2->data - skb_mac_header(skb2)); + pskb = skb2; } return skb; -- cgit v1.2.3 From d78008de6103c708171baff9650a7862645d23b0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 3 Mar 2020 15:02:45 +0100 Subject: netfilter: nf_tables: dump NFTA_CHAIN_FLAGS attribute Missing NFTA_CHAIN_FLAGS netlink attribute when dumping basechain definitions. Fixes: c9626a2cbdb2 ("netfilter: nf_tables: add hardware offload support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bb064aa4154b..f9e60981bd36 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1405,6 +1405,11 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, lockdep_commit_lock_is_held(net)); if (nft_dump_stats(skb, stats)) goto nla_put_failure; + + if ((chain->flags & NFT_CHAIN_HW_OFFLOAD) && + nla_put_be32(skb, NFTA_CHAIN_FLAGS, + htonl(NFT_CHAIN_HW_OFFLOAD))) + goto nla_put_failure; } if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) -- cgit v1.2.3 From 1d305ba40eb8081ff21eeb8ca6ba5c70fd920934 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 5 Mar 2020 11:15:36 +0100 Subject: netfilter: nf_tables: fix infinite loop when expr is not available nft will loop forever if the kernel doesn't support an expression: 1. nft_expr_type_get() appends the family specific name to the module list. 2. -EAGAIN is returned to nfnetlink, nfnetlink calls abort path. 3. abort path sets ->done to true and calls request_module for the expression. 4. nfnetlink replays the batch, we end up in nft_expr_type_get() again. 5. nft_expr_type_get attempts to append family-specific name. This one already exists on the list, so we continue 6. nft_expr_type_get adds the generic expression name to the module list. -EAGAIN is returned, nfnetlink calls abort path. 7. abort path encounters the family-specific expression which has 'done' set, so it gets removed. 8. abort path requests the generic expression name, sets done to true. 9. batch is replayed. If the expression could not be loaded, then we will end up back at 1), because the family-specific name got removed and the cycle starts again. Note that userspace can SIGKILL the nft process to stop the cycle, but the desired behaviour is to return an error after the generic expr name fails to load the expression. Fixes: eb014de4fd418 ("netfilter: nf_tables: autoload modules from the abort path") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f9e60981bd36..38c680f28f15 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7388,13 +7388,8 @@ static void nf_tables_module_autoload(struct net *net) list_splice_init(&net->nft.module_list, &module_list); mutex_unlock(&net->nft.commit_mutex); list_for_each_entry_safe(req, next, &module_list, list) { - if (req->done) { - list_del(&req->list); - kfree(req); - } else { - request_module("%s", req->module); - req->done = true; - } + request_module("%s", req->module); + req->done = true; } mutex_lock(&net->nft.commit_mutex); list_splice(&module_list, &net->nft.module_list); @@ -8177,6 +8172,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) __nft_release_tables(net); mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); + WARN_ON_ONCE(!list_empty(&net->nft.module_list)); } static struct pernet_operations nf_tables_net_ops = { -- cgit v1.2.3 From a3aefbfe45751bf7b338c181b97608e276b5bb73 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 4 Mar 2020 17:24:31 +0300 Subject: net: nfc: fix bounds checking bugs on "pipe" This is similar to commit 674d9de02aa7 ("NFC: Fix possible memory corruption when handling SHDLC I-Frame commands") and commit d7ee81ad09f0 ("NFC: nci: Add some bounds checking in nci_hci_cmd_received()") which added range checks on "pipe". The "pipe" variable comes skb->data[0] in nfc_hci_msg_rx_work(). It's in the 0-255 range. We're using it as the array index into the hdev->pipes[] array which has NFC_HCI_MAX_PIPES (128) members. Fixes: 118278f20aa8 ("NFC: hci: Add pipes table to reference them with a tuple {gate, host}") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/nfc/hci/core.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 6f1b096e601c..43811b5219b5 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -181,13 +181,20 @@ exit: void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, struct sk_buff *skb) { - u8 gate = hdev->pipes[pipe].gate; u8 status = NFC_HCI_ANY_OK; struct hci_create_pipe_resp *create_info; struct hci_delete_pipe_noti *delete_info; struct hci_all_pipe_cleared_noti *cleared_info; + u8 gate; - pr_debug("from gate %x pipe %x cmd %x\n", gate, pipe, cmd); + pr_debug("from pipe %x cmd %x\n", pipe, cmd); + + if (pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + + gate = hdev->pipes[pipe].gate; switch (cmd) { case NFC_HCI_ADM_NOTIFY_PIPE_CREATED: @@ -375,8 +382,14 @@ void nfc_hci_event_received(struct nfc_hci_dev *hdev, u8 pipe, u8 event, struct sk_buff *skb) { int r = 0; - u8 gate = hdev->pipes[pipe].gate; + u8 gate; + + if (pipe >= NFC_HCI_MAX_PIPES) { + pr_err("Discarded event %x to invalid pipe %x\n", event, pipe); + goto exit; + } + gate = hdev->pipes[pipe].gate; if (gate == NFC_HCI_INVALID_GATE) { pr_err("Discarded event %x to unopened pipe %x\n", event, pipe); goto exit; -- cgit v1.2.3 From 2398e3991bda7caa6b112a6f650fbab92f732b91 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 4 Mar 2020 16:51:07 +0100 Subject: mptcp: always include dack if possible. Currently passive MPTCP socket can skip including the DACK option - if the peer sends data before accept() completes. The above happens because the msk 'can_ack' flag is set only after the accept() call. Such missing DACK option may cause - as per RFC spec - unwanted fallback to TCP. This change addresses the issue using the key material available in the current subflow, if any, to create a suitable dack option when msk ack seq is not yet available. v1 -> v2: - adavance the generated ack after the initial MPC packet Fixes: d22f4988ffec ("mptcp: process MP_CAPABLE data option") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 45acd877bef3..fd2c3150e591 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -334,6 +334,8 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, struct mptcp_sock *msk; unsigned int ack_size; bool ret = false; + bool can_ack; + u64 ack_seq; u8 tcp_fin; if (skb) { @@ -360,9 +362,22 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, ret = true; } + /* passive sockets msk will set the 'can_ack' after accept(), even + * if the first subflow may have the already the remote key handy + */ + can_ack = true; opts->ext_copy.use_ack = 0; msk = mptcp_sk(subflow->conn); - if (!msk || !READ_ONCE(msk->can_ack)) { + if (likely(msk && READ_ONCE(msk->can_ack))) { + ack_seq = msk->ack_seq; + } else if (subflow->can_ack) { + mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); + ack_seq++; + } else { + can_ack = false; + } + + if (unlikely(!can_ack)) { *size = ALIGN(dss_size, 4); return ret; } @@ -375,7 +390,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, dss_size += ack_size; - opts->ext_copy.data_ack = msk->ack_seq; + opts->ext_copy.data_ack = ack_seq; opts->ext_copy.ack64 = 1; opts->ext_copy.use_ack = 1; -- cgit v1.2.3 From 6a42cefb25d8bdc1b391f4a53c78c32164eea2dd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 6 Mar 2020 17:37:28 +0100 Subject: netfilter: nft_chain_nat: inet family is missing module ownership Set owner to THIS_MODULE, otherwise the nft_chain_nat module might be removed while there are still inet/nat chains in place. [ 117.942096] BUG: unable to handle page fault for address: ffffffffa0d5e040 [ 117.942101] #PF: supervisor read access in kernel mode [ 117.942103] #PF: error_code(0x0000) - not-present page [ 117.942106] PGD 200c067 P4D 200c067 PUD 200d063 PMD 3dc909067 PTE 0 [ 117.942113] Oops: 0000 [#1] PREEMPT SMP PTI [ 117.942118] CPU: 3 PID: 27 Comm: kworker/3:0 Not tainted 5.6.0-rc3+ #348 [ 117.942133] Workqueue: events nf_tables_trans_destroy_work [nf_tables] [ 117.942145] RIP: 0010:nf_tables_chain_destroy.isra.0+0x94/0x15a [nf_tables] [ 117.942149] Code: f6 45 54 01 0f 84 d1 00 00 00 80 3b 05 74 44 48 8b 75 e8 48 c7 c7 72 be de a0 e8 56 e6 2d e0 48 8b 45 e8 48 c7 c7 7f be de a0 <48> 8b 30 e8 43 e6 2d e0 48 8b 45 e8 48 8b 40 10 48 85 c0 74 5b 8b [ 117.942152] RSP: 0018:ffffc9000015be10 EFLAGS: 00010292 [ 117.942155] RAX: ffffffffa0d5e040 RBX: ffff88840be87fc2 RCX: 0000000000000007 [ 117.942158] RDX: 0000000000000007 RSI: 0000000000000086 RDI: ffffffffa0debe7f [ 117.942160] RBP: ffff888403b54b50 R08: 0000000000001482 R09: 0000000000000004 [ 117.942162] R10: 0000000000000000 R11: 0000000000000001 R12: ffff8883eda7e540 [ 117.942164] R13: dead000000000122 R14: dead000000000100 R15: ffff888403b3db80 [ 117.942167] FS: 0000000000000000(0000) GS:ffff88840e4c0000(0000) knlGS:0000000000000000 [ 117.942169] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 117.942172] CR2: ffffffffa0d5e040 CR3: 00000003e4c52002 CR4: 00000000001606e0 [ 117.942174] Call Trace: [ 117.942188] nf_tables_trans_destroy_work.cold+0xd/0x12 [nf_tables] [ 117.942196] process_one_work+0x1d6/0x3b0 [ 117.942200] worker_thread+0x45/0x3c0 [ 117.942203] ? process_one_work+0x3b0/0x3b0 [ 117.942210] kthread+0x112/0x130 [ 117.942214] ? kthread_create_worker_on_cpu+0x40/0x40 [ 117.942221] ret_from_fork+0x35/0x40 nf_tables_chain_destroy() crashes on module_put() because the module is gone. Fixes: d164385ec572 ("netfilter: nat: add inet family nat support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_chain_nat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c index ff9ac8ae0031..eac4a901233f 100644 --- a/net/netfilter/nft_chain_nat.c +++ b/net/netfilter/nft_chain_nat.c @@ -89,6 +89,7 @@ static const struct nft_chain_type nft_chain_nat_inet = { .name = "nat", .type = NFT_CHAIN_T_NAT, .family = NFPROTO_INET, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | -- cgit v1.2.3 From 17c25cafd4d3e74c83dce56b158843b19c40b414 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Mar 2020 22:05:14 -0800 Subject: gre: fix uninit-value in __iptunnel_pull_header syzbot found an interesting case of the kernel reading an uninit-value [1] Problem is in the handling of ETH_P_WCCP in gre_parse_header() We look at the byte following GRE options to eventually decide if the options are four bytes longer. Use skb_header_pointer() to not pull bytes if we found that no more bytes were needed. All callers of gre_parse_header() are properly using pskb_may_pull() anyway before proceeding to next header. [1] BUG: KMSAN: uninit-value in pskb_may_pull include/linux/skbuff.h:2303 [inline] BUG: KMSAN: uninit-value in __iptunnel_pull_header+0x30c/0xbd0 net/ipv4/ip_tunnel_core.c:94 CPU: 1 PID: 11784 Comm: syz-executor940 Not tainted 5.6.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x220 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:118 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215 pskb_may_pull include/linux/skbuff.h:2303 [inline] __iptunnel_pull_header+0x30c/0xbd0 net/ipv4/ip_tunnel_core.c:94 iptunnel_pull_header include/net/ip_tunnels.h:411 [inline] gre_rcv+0x15e/0x19c0 net/ipv6/ip6_gre.c:606 ip6_protocol_deliver_rcu+0x181b/0x22c0 net/ipv6/ip6_input.c:432 ip6_input_finish net/ipv6/ip6_input.c:473 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ip6_input net/ipv6/ip6_input.c:482 [inline] ip6_mc_input+0xdf2/0x1460 net/ipv6/ip6_input.c:576 dst_input include/net/dst.h:442 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:76 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ipv6_rcv+0x683/0x710 net/ipv6/ip6_input.c:306 __netif_receive_skb_one_core net/core/dev.c:5198 [inline] __netif_receive_skb net/core/dev.c:5312 [inline] netif_receive_skb_internal net/core/dev.c:5402 [inline] netif_receive_skb+0x66b/0xf20 net/core/dev.c:5461 tun_rx_batched include/linux/skbuff.h:4321 [inline] tun_get_user+0x6aef/0x6f60 drivers/net/tun.c:1997 tun_chr_write_iter+0x1f2/0x360 drivers/net/tun.c:2026 call_write_iter include/linux/fs.h:1901 [inline] new_sync_write fs/read_write.c:483 [inline] __vfs_write+0xa5a/0xca0 fs/read_write.c:496 vfs_write+0x44a/0x8f0 fs/read_write.c:558 ksys_write+0x267/0x450 fs/read_write.c:611 __do_sys_write fs/read_write.c:623 [inline] __se_sys_write fs/read_write.c:620 [inline] __ia32_sys_write+0xdb/0x120 fs/read_write.c:620 do_syscall_32_irqs_on arch/x86/entry/common.c:339 [inline] do_fast_syscall_32+0x3c7/0x6e0 arch/x86/entry/common.c:410 entry_SYSENTER_compat+0x68/0x77 arch/x86/entry/entry_64_compat.S:139 RIP: 0023:0xf7f62d99 Code: 90 e8 0b 00 00 00 f3 90 0f ae e8 eb f9 8d 74 26 00 89 3c 24 c3 90 90 90 90 90 90 90 90 90 90 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 002b:00000000fffedb2c EFLAGS: 00000217 ORIG_RAX: 0000000000000004 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000020002580 RDX: 0000000000000fca RSI: 0000000000000036 RDI: 0000000000000004 RBP: 0000000000008914 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:144 [inline] kmsan_internal_poison_shadow+0x66/0xd0 mm/kmsan/kmsan.c:127 kmsan_slab_alloc+0x8a/0xe0 mm/kmsan/kmsan_hooks.c:82 slab_alloc_node mm/slub.c:2793 [inline] __kmalloc_node_track_caller+0xb40/0x1200 mm/slub.c:4401 __kmalloc_reserve net/core/skbuff.c:142 [inline] __alloc_skb+0x2fd/0xac0 net/core/skbuff.c:210 alloc_skb include/linux/skbuff.h:1051 [inline] alloc_skb_with_frags+0x18c/0xa70 net/core/skbuff.c:5766 sock_alloc_send_pskb+0xada/0xc60 net/core/sock.c:2242 tun_alloc_skb drivers/net/tun.c:1529 [inline] tun_get_user+0x10ae/0x6f60 drivers/net/tun.c:1843 tun_chr_write_iter+0x1f2/0x360 drivers/net/tun.c:2026 call_write_iter include/linux/fs.h:1901 [inline] new_sync_write fs/read_write.c:483 [inline] __vfs_write+0xa5a/0xca0 fs/read_write.c:496 vfs_write+0x44a/0x8f0 fs/read_write.c:558 ksys_write+0x267/0x450 fs/read_write.c:611 __do_sys_write fs/read_write.c:623 [inline] __se_sys_write fs/read_write.c:620 [inline] __ia32_sys_write+0xdb/0x120 fs/read_write.c:620 do_syscall_32_irqs_on arch/x86/entry/common.c:339 [inline] do_fast_syscall_32+0x3c7/0x6e0 arch/x86/entry/common.c:410 entry_SYSENTER_compat+0x68/0x77 arch/x86/entry/entry_64_compat.S:139 Fixes: 95f5c64c3c13 ("gre: Move utility functions to common headers") Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/gre_demux.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 5fd6e8ed02b5..66fdbfe5447c 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -56,7 +56,9 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) } EXPORT_SYMBOL_GPL(gre_del_protocol); -/* Fills in tpi and returns header length to be pulled. */ +/* Fills in tpi and returns header length to be pulled. + * Note that caller must use pskb_may_pull() before pulling GRE header. + */ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err, __be16 proto, int nhs) { @@ -110,8 +112,14 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header */ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { + u8 _val, *val; + + val = skb_header_pointer(skb, nhs + hdr_len, + sizeof(_val), &_val); + if (!val) + return -EINVAL; tpi->proto = proto; - if ((*(u8 *)options & 0xF0) != 0x40) + if ((*val & 0xF0) != 0x40) hdr_len += 4; } tpi->hdr_len = hdr_len; -- cgit v1.2.3 From 83f73c5bb7b9a9135173f0ba2b1aa00c06664ff9 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Thu, 5 Mar 2020 15:33:12 +0300 Subject: inet_diag: return classid for all socket types In commit 1ec17dbd90f8 ("inet_diag: fix reporting cgroup classid and fallback to priority") croup classid reporting was fixed. But this works only for TCP sockets because for other socket types icsk parameter can be NULL and classid code path is skipped. This change moves classid handling to inet_diag_msg_attrs_fill() function. Also inet_diag_msg_attrs_size() helper was added and addends in nlmsg_new() were reordered to save order from inet_sk_diag_fill(). Fixes: 1ec17dbd90f8 ("inet_diag: fix reporting cgroup classid and fallback to priority") Signed-off-by: Dmitry Yakunin Reviewed-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 18 ++++++++++++------ net/ipv4/inet_diag.c | 44 ++++++++++++++++++++------------------------ net/ipv4/raw_diag.c | 5 +++-- net/ipv4/udp_diag.c | 5 +++-- net/sctp/diag.c | 8 ++------ 5 files changed, 40 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 39faaaf843e1..c91cf2dee12a 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -2,15 +2,10 @@ #ifndef _INET_DIAG_H_ #define _INET_DIAG_H_ 1 +#include #include -struct net; -struct sock; struct inet_hashinfo; -struct nlattr; -struct nlmsghdr; -struct sk_buff; -struct netlink_callback; struct inet_diag_handler { void (*dump)(struct sk_buff *skb, @@ -62,6 +57,17 @@ int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk); void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk); +static inline size_t inet_diag_msg_attrs_size(void) +{ + return nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + + nla_total_size(1) /* INET_DIAG_TOS */ +#if IS_ENABLED(CONFIG_IPV6) + + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(1) /* INET_DIAG_SKV6ONLY */ +#endif + + nla_total_size(4) /* INET_DIAG_MARK */ + + nla_total_size(4); /* INET_DIAG_CLASS_ID */ +} int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, struct user_namespace *user_ns, bool net_admin); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f11e997e517b..8c8377568a78 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -100,13 +100,9 @@ static size_t inet_sk_attr_size(struct sock *sk, aux = handler->idiag_get_aux_size(sk, net_admin); return nla_total_size(sizeof(struct tcp_info)) - + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ - + nla_total_size(1) /* INET_DIAG_TOS */ - + nla_total_size(1) /* INET_DIAG_TCLASS */ - + nla_total_size(4) /* INET_DIAG_MARK */ - + nla_total_size(4) /* INET_DIAG_CLASS_ID */ - + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + nla_total_size(TCP_CA_NAME_MAX) + nla_total_size(sizeof(struct tcpvegas_info)) @@ -147,6 +143,24 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark)) goto errout; + if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) || + ext & (1 << (INET_DIAG_TCLASS - 1))) { + u32 classid = 0; + +#ifdef CONFIG_SOCK_CGROUP_DATA + classid = sock_cgroup_classid(&sk->sk_cgrp_data); +#endif + /* Fallback to socket priority if class id isn't set. + * Classful qdiscs use it as direct reference to class. + * For cgroup2 classid is always zero. + */ + if (!classid) + classid = sk->sk_priority; + + if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid)) + goto errout; + } + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); @@ -284,24 +298,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto errout; } - if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) || - ext & (1 << (INET_DIAG_TCLASS - 1))) { - u32 classid = 0; - -#ifdef CONFIG_SOCK_CGROUP_DATA - classid = sock_cgroup_classid(&sk->sk_cgrp_data); -#endif - /* Fallback to socket priority if class id isn't set. - * Classful qdiscs use it as direct reference to class. - * For cgroup2 classid is always zero. - */ - if (!classid) - classid = sk->sk_priority; - - if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid)) - goto errout; - } - out: nlmsg_end(skb, nlh); return 0; diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index e35736b99300..a93e7d1e1251 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -100,8 +100,9 @@ static int raw_diag_dump_one(struct sk_buff *in_skb, if (IS_ERR(sk)) return PTR_ERR(sk); - rep = nlmsg_new(sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + 64, + rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) { sock_put(sk); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 910555a4d9fe..dccd2286bc28 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -64,8 +64,9 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, goto out; err = -ENOMEM; - rep = nlmsg_new(sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + 64, + rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) goto out; diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 8a15146faaeb..1069d7af3672 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -237,15 +237,11 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc) addrcnt++; return nla_total_size(sizeof(struct sctp_info)) - + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ - + nla_total_size(1) /* INET_DIAG_TOS */ - + nla_total_size(1) /* INET_DIAG_TCLASS */ - + nla_total_size(4) /* INET_DIAG_MARK */ - + nla_total_size(4) /* INET_DIAG_CLASS_ID */ + nla_total_size(addrlen * asoc->peer.transport_count) + nla_total_size(addrlen * addrcnt) - + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64; } -- cgit v1.2.3 From 018d26fcd12a75fb9b5fe233762aa3f2f0854b88 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Thu, 5 Mar 2020 17:45:57 +0300 Subject: cgroup, netclassid: periodically release file_lock on classid updating MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In our production environment we have faced with problem that updating classid in cgroup with heavy tasks cause long freeze of the file tables in this tasks. By heavy tasks we understand tasks with many threads and opened sockets (e.g. balancers). This freeze leads to an increase number of client timeouts. This patch implements following logic to fix this issue: аfter iterating 1000 file descriptors file table lock will be released thus providing a time gap for socket creation/deletion. Now update is non atomic and socket may be skipped using calls: dup2(oldfd, newfd); close(oldfd); But this case is not typical. Moreover before this patch skip is possible too by hiding socket fd in unix socket buffer. New sockets will be allocated with updated classid because cgroup state is updated before start of the file descriptors iteration. So in common cases this patch has no side effects. Signed-off-by: Dmitry Yakunin Reviewed-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- net/core/netclassid_cgroup.c | 47 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 0642f91c4038..b4c87fe31be2 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -53,30 +53,60 @@ static void cgrp_css_free(struct cgroup_subsys_state *css) kfree(css_cls_state(css)); } +/* + * To avoid freezing of sockets creation for tasks with big number of threads + * and opened sockets lets release file_lock every 1000 iterated descriptors. + * New sockets will already have been created with new classid. + */ + +struct update_classid_context { + u32 classid; + unsigned int batch; +}; + +#define UPDATE_CLASSID_BATCH 1000 + static int update_classid_sock(const void *v, struct file *file, unsigned n) { int err; + struct update_classid_context *ctx = (void *)v; struct socket *sock = sock_from_file(file, &err); if (sock) { spin_lock(&cgroup_sk_update_lock); - sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, - (unsigned long)v); + sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid); spin_unlock(&cgroup_sk_update_lock); } + if (--ctx->batch == 0) { + ctx->batch = UPDATE_CLASSID_BATCH; + return n + 1; + } return 0; } +static void update_classid_task(struct task_struct *p, u32 classid) +{ + struct update_classid_context ctx = { + .classid = classid, + .batch = UPDATE_CLASSID_BATCH + }; + unsigned int fd = 0; + + do { + task_lock(p); + fd = iterate_fd(p->files, fd, update_classid_sock, &ctx); + task_unlock(p); + cond_resched(); + } while (fd); +} + static void cgrp_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; struct task_struct *p; cgroup_taskset_for_each(p, css, tset) { - task_lock(p); - iterate_fd(p->files, 0, update_classid_sock, - (void *)(unsigned long)css_cls_state(css)->classid); - task_unlock(p); + update_classid_task(p, css_cls_state(css)->classid); } } @@ -98,10 +128,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, css_task_iter_start(css, 0, &it); while ((p = css_task_iter_next(&it))) { - task_lock(p); - iterate_fd(p->files, 0, update_classid_sock, - (void *)(unsigned long)cs->classid); - task_unlock(p); + update_classid_task(p, cs->classid); cond_resched(); } css_task_iter_end(&it); -- cgit v1.2.3 From d752a4986532cb6305dfd5290a614cde8072769d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 9 Mar 2020 22:16:06 -0700 Subject: net: memcg: late association of sock to memcg If a TCP socket is allocated in IRQ context or cloned from unassociated (i.e. not associated to a memcg) in IRQ context then it will remain unassociated for its whole life. Almost half of the TCPs created on the system are created in IRQ context, so, memory used by such sockets will not be accounted by the memcg. This issue is more widespread in cgroup v1 where network memory accounting is opt-in but it can happen in cgroup v2 if the source socket for the cloning was created in root memcg. To fix the issue, just do the association of the sockets at the accept() time in the process context and then force charge the memory buffer already used and reserved by the socket. Signed-off-by: Shakeel Butt Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- mm/memcontrol.c | 14 -------------- net/core/sock.c | 5 ++++- net/ipv4/inet_connection_sock.c | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1cf41ee22258..2058b8da18db 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6682,20 +6682,6 @@ void mem_cgroup_sk_alloc(struct sock *sk) if (!mem_cgroup_sockets_enabled) return; - /* - * Socket cloning can throw us here with sk_memcg already - * filled. It won't however, necessarily happen from - * process context. So the test for root memcg given - * the current task's memcg won't help us in this case. - * - * Respecting the original socket's memcg is a better - * decision in this case. - */ - if (sk->sk_memcg) { - css_get(&sk->sk_memcg->css); - return; - } - /* Do not associate the sock with unrelated interrupted task's memcg. */ if (in_interrupt()) return; diff --git a/net/core/sock.c b/net/core/sock.c index a4c8fac781ff..8f71684305c3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1830,7 +1830,10 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); - mem_cgroup_sk_alloc(newsk); + + /* sk->sk_memcg will be populated at accept() time */ + newsk->sk_memcg = NULL; + cgroup_sk_alloc(&newsk->sk_cgrp_data); rcu_read_lock(); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a4db79b1b643..65a3b2565102 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -482,6 +482,26 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) } spin_unlock_bh(&queue->fastopenq.lock); } + + if (mem_cgroup_sockets_enabled) { + int amt; + + /* atomically get the memory usage, set and charge the + * sk->sk_memcg. + */ + lock_sock(newsk); + + /* The sk has not been accepted yet, no need to look at + * sk->sk_wmem_queued. + */ + amt = sk_mem_pages(newsk->sk_forward_alloc + + atomic_read(&sk->sk_rmem_alloc)); + mem_cgroup_sk_alloc(newsk); + if (newsk->sk_memcg && amt) + mem_cgroup_charge_skmem(newsk->sk_memcg, amt); + + release_sock(newsk); + } out: release_sock(sk); if (req) -- cgit v1.2.3 From 60380488e4e0b95e9e82aa68aa9705baa86de84c Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 10 Mar 2020 15:27:37 +0800 Subject: ipv6/addrconf: call ipv6_mc_up() for non-Ethernet interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rafał found an issue that for non-Ethernet interface, if we down and up frequently, the memory will be consumed slowly. The reason is we add allnodes/allrouters addressed in multicast list in ipv6_add_dev(). When link down, we call ipv6_mc_down(), store all multicast addresses via mld_add_delrec(). But when link up, we don't call ipv6_mc_up() for non-Ethernet interface to remove the addresses. This makes idev->mc_tomb getting bigger and bigger. The call stack looks like: addrconf_notify(NETDEV_REGISTER) ipv6_add_dev ipv6_dev_mc_inc(ff01::1) ipv6_dev_mc_inc(ff02::1) ipv6_dev_mc_inc(ff02::2) addrconf_notify(NETDEV_UP) addrconf_dev_config /* Alas, we support only Ethernet autoconfiguration. */ return; addrconf_notify(NETDEV_DOWN) addrconf_ifdown ipv6_mc_down igmp6_group_dropped(ff02::2) mld_add_delrec(ff02::2) igmp6_group_dropped(ff02::1) igmp6_group_dropped(ff01::1) After investigating, I can't found a rule to disable multicast on non-Ethernet interface. In RFC2460, the link could be Ethernet, PPP, ATM, tunnels, etc. In IPv4, it doesn't check the dev type when calls ip_mc_up() in inetdev_event(). Even for IPv6, we don't check the dev type and call ipv6_add_dev(), ipv6_dev_mc_inc() after register device. So I think it's OK to fix this memory consumer by calling ipv6_mc_up() for non-Ethernet interface. v2: Also check IFF_MULTICAST flag to make sure the interface supports multicast Reported-by: Rafał Miłecki Tested-by: Rafał Miłecki Fixes: 74235a25c673 ("[IPV6] addrconf: Fix IPv6 on tuntap tunnels") Fixes: 1666d49e1d41 ("mld: do not remove mld souce list info when set link down") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e6e1290ea06f..46d614b611db 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3347,6 +3347,10 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_NONE) && (dev->type != ARPHRD_RAWIP)) { /* Alas, we support only Ethernet autoconfiguration. */ + idev = __in6_dev_get(dev); + if (!IS_ERR_OR_NULL(idev) && dev->flags & IFF_UP && + dev->flags & IFF_MULTICAST) + ipv6_mc_up(idev); return; } -- cgit v1.2.3 From ece0d7bd74615773268475b6b64d6f1ebbd4b4c6 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Tue, 10 Mar 2020 09:33:30 +0100 Subject: net/smc: cancel event worker during device removal During IB device removal, cancel the event worker before the device structure is freed. Fixes: a4cf0443c414 ("smc: introduce SMC as an IB-client") Reported-by: syzbot+b297c6825752e7a07272@syzkaller.appspotmail.com Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Reviewed-by: Leon Romanovsky Signed-off-by: David S. Miller --- net/smc/smc_ib.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index d6ba186f67e2..05b825b3cfa4 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -582,6 +582,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) smc_smcr_terminate_all(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); + cancel_work_sync(&smcibdev->port_event_work); kfree(smcibdev); } -- cgit v1.2.3 From 0e1a1d853ecedc99da9d27f9f5c376935547a0e2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:10:56 -0800 Subject: nl80211: add missing attribute validation for critical protocol indication Add missing attribute validation for critical protocol fields to the netlink policy. Fixes: 5de17984898c ("cfg80211: introduce critical protocol indication from user-space") Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20200303051058.4089398-2-kuba@kernel.org Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5b19e9fac4aa..cd0e024d7cb6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -531,6 +531,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MDID] = { .type = NLA_U16 }, [NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_CRIT_PROT_ID] = { .type = NLA_U16 }, + [NL80211_ATTR_MAX_CRIT_PROT_DURATION] = { .type = NLA_U16 }, [NL80211_ATTR_PEER_AID] = NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID), [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 }, -- cgit v1.2.3 From 056e9375e1f3c4bf2fd49b70258c7daf788ecd9d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:10:57 -0800 Subject: nl80211: add missing attribute validation for beacon report scanning Add missing attribute validation for beacon report scanning to the netlink policy. Fixes: 1d76250bd34a ("nl80211: support beacon report scanning") Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20200303051058.4089398-3-kuba@kernel.org Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index cd0e024d7cb6..48e6508aba52 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -470,6 +470,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WOWLAN_TRIGGERS] = { .type = NLA_NESTED }, [NL80211_ATTR_STA_PLINK_STATE] = NLA_POLICY_MAX(NLA_U8, NUM_NL80211_PLINK_STATES - 1), + [NL80211_ATTR_MEASUREMENT_DURATION] = { .type = NLA_U16 }, + [NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY] = { .type = NLA_FLAG }, [NL80211_ATTR_MESH_PEER_AID] = NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID), [NL80211_ATTR_SCHED_SCAN_INTERVAL] = { .type = NLA_U32 }, -- cgit v1.2.3 From 5cde05c61cbe13cbb3fa66d52b9ae84f7975e5e6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:10:58 -0800 Subject: nl80211: add missing attribute validation for channel switch Add missing attribute validation for NL80211_ATTR_OPER_CLASS to the netlink policy. Fixes: 1057d35ede5d ("cfg80211: introduce TDLS channel switch commands") Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20200303051058.4089398-4-kuba@kernel.org Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 48e6508aba52..ec5d67794aab 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -565,6 +565,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_MAX(NLA_U8, IEEE80211_NUM_UPS - 1), [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, + [NL80211_ATTR_OPER_CLASS] = { .type = NLA_U8 }, [NL80211_ATTR_MAC_MASK] = { .type = NLA_EXACT_LEN_WARN, .len = ETH_ALEN -- cgit v1.2.3 From ba32679cac50c38fdf488296f96b1f3175532b8e Mon Sep 17 00:00:00 2001 From: Nicolas Cavallari Date: Thu, 5 Mar 2020 15:04:09 +0100 Subject: mac80211: Do not send mesh HWMP PREQ if HWMP is disabled When trying to transmit to an unknown destination, the mesh code would unconditionally transmit a HWMP PREQ even if HWMP is not the current path selection algorithm. Signed-off-by: Nicolas Cavallari Link: https://lore.kernel.org/r/20200305140409.12204-1-cavallar@lri.fr Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index d69983370381..38a0383dfbcf 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1152,7 +1152,8 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, } } - if (!(mpath->flags & MESH_PATH_RESOLVING)) + if (!(mpath->flags & MESH_PATH_RESOLVING) && + mesh_path_sel_is_hwmp(sdata)) mesh_queue_preq(mpath, PREQ_Q_F_START); if (skb_queue_len(&mpath->frame_queue) >= MESH_FRAME_QUEUE_LEN) -- cgit v1.2.3 From 90db6d772f749e38171d04619a5e3cd8804a6d02 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 10 Mar 2020 09:41:48 -0700 Subject: bpf, sockmap: Remove bucket->lock from sock_{hash|map}_free The bucket->lock is not needed in the sock_hash_free and sock_map_free calls, in fact it is causing a splat due to being inside rcu block. | BUG: sleeping function called from invalid context at net/core/sock.c:2935 | in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 62, name: kworker/0:1 | 3 locks held by kworker/0:1/62: | #0: ffff88813b019748 ((wq_completion)events){+.+.}, at: process_one_work+0x1d7/0x5e0 | #1: ffffc900000abe50 ((work_completion)(&map->work)){+.+.}, at: process_one_work+0x1d7/0x5e0 | #2: ffff8881381f6df8 (&stab->lock){+...}, at: sock_map_free+0x26/0x180 | CPU: 0 PID: 62 Comm: kworker/0:1 Not tainted 5.5.0-04008-g7b083332376e #454 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 | Workqueue: events bpf_map_free_deferred | Call Trace: | dump_stack+0x71/0xa0 | ___might_sleep.cold+0xa6/0xb6 | lock_sock_nested+0x28/0x90 | sock_map_free+0x5f/0x180 | bpf_map_free_deferred+0x58/0x80 | process_one_work+0x260/0x5e0 | worker_thread+0x4d/0x3e0 | kthread+0x108/0x140 | ? process_one_work+0x5e0/0x5e0 | ? kthread_park+0x90/0x90 | ret_from_fork+0x3a/0x50 The reason we have stab->lock and bucket->locks in sockmap code is to handle checking EEXIST in update/delete cases. We need to be careful during an update operation that we check for EEXIST and we need to ensure that the psock object is not in some partial state of removal/insertion while we do this. So both map_update_common and sock_map_delete need to guard from being run together potentially deleting an entry we are checking, etc. But by the time we get to the tear-down code in sock_{ma[|hash}_free we have already disconnected the map and we just did synchronize_rcu() in the line above so no updates/deletes should be in flight. Because of this we can drop the bucket locks from the map free'ing code, noting no update/deletes can be in-flight. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Reported-by: Jakub Sitnicki Suggested-by: Jakub Sitnicki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/158385850787.30597.8346421465837046618.stgit@john-Precision-5820-Tower --- net/core/sock_map.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 085cef5857bb..b70c844a88ec 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -233,8 +233,11 @@ static void sock_map_free(struct bpf_map *map) struct bpf_stab *stab = container_of(map, struct bpf_stab, map); int i; + /* After the sync no updates or deletes will be in-flight so it + * is safe to walk map and remove entries without risking a race + * in EEXIST update case. + */ synchronize_rcu(); - raw_spin_lock_bh(&stab->lock); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; struct sock *sk; @@ -248,7 +251,6 @@ static void sock_map_free(struct bpf_map *map) release_sock(sk); } } - raw_spin_unlock_bh(&stab->lock); /* wait for psock readers accessing its map link */ synchronize_rcu(); @@ -863,10 +865,13 @@ static void sock_hash_free(struct bpf_map *map) struct hlist_node *node; int i; + /* After the sync no updates or deletes will be in-flight so it + * is safe to walk map and remove entries without risking a race + * in EEXIST update case. + */ synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); - raw_spin_lock_bh(&bucket->lock); hlist_for_each_entry_safe(elem, node, &bucket->head, node) { hlist_del_rcu(&elem->node); lock_sock(elem->sk); @@ -875,7 +880,6 @@ static void sock_hash_free(struct bpf_map *map) rcu_read_unlock(); release_sock(elem->sk); } - raw_spin_unlock_bh(&bucket->lock); } /* wait for psock readers accessing its map link */ -- cgit v1.2.3 From f9fc28a8de2fb367f1ab76f0cf176ca545db3d6f Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Thu, 12 Mar 2020 11:04:20 +0530 Subject: net: caif: Add lockdep expression to RCU traversal primitive caifdevs->list is traversed using list_for_each_entry_rcu() outside an RCU read-side critical section but under the protection of rtnl_mutex. Hence, add the corresponding lockdep expression to silence the following false-positive warning: [ 10.868467] ============================= [ 10.869082] WARNING: suspicious RCU usage [ 10.869817] 5.6.0-rc1-00177-g06ec0a154aae4 #1 Not tainted [ 10.870804] ----------------------------- [ 10.871557] net/caif/caif_dev.c:115 RCU-list traversed in non-reader section!! Reported-by: kernel test robot Signed-off-by: Amol Grover Signed-off-by: David S. Miller --- net/caif/caif_dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 03c7cdd8e4cb..195d2d67be8a 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -112,7 +112,8 @@ static struct caif_device_entry *caif_get(struct net_device *dev) caif_device_list(dev_net(dev)); struct caif_device_entry *caifd; - list_for_each_entry_rcu(caifd, &caifdevs->list, list) { + list_for_each_entry_rcu(caifd, &caifdevs->list, list, + lockdep_rtnl_is_held()) { if (caifd->netdev == dev) return caifd; } -- cgit v1.2.3 From 46e4c421a053c36bf7a33dda2272481bcaf3eed3 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 9 Mar 2020 11:34:35 -0400 Subject: net/packet: tpacket_rcv: do not increment ring index on drop In one error case, tpacket_rcv drops packets after incrementing the ring producer index. If this happens, it does not update tp_status to TP_STATUS_USER and thus the reader is stalled for an iteration of the ring, causing out of order arrival. The only such error path is when virtio_net_hdr_from_skb fails due to encountering an unknown GSO type. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 30c6879d6774..e5b0986215d2 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2274,6 +2274,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, TP_STATUS_KERNEL, (macoff+snaplen)); if (!h.raw) goto drop_n_account; + + if (do_vnet && + virtio_net_hdr_from_skb(skb, h.raw + macoff - + sizeof(struct virtio_net_hdr), + vio_le(), true, 0)) + goto drop_n_account; + if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); /* @@ -2286,12 +2293,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, status |= TP_STATUS_LOSING; } - if (do_vnet && - virtio_net_hdr_from_skb(skb, h.raw + macoff - - sizeof(struct virtio_net_hdr), - vio_le(), true, 0)) - goto drop_n_account; - po->stats.stats1.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; -- cgit v1.2.3 From a20f997010c4ec76eaa55b8cc047d76dcac69f70 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 11 Mar 2020 16:24:24 +0100 Subject: net: dsa: Don't instantiate phylink for CPU/DSA ports unless needed By default, DSA drivers should configure CPU and DSA ports to their maximum speed. In many configurations this is sufficient to make the link work. In some cases it is necessary to configure the link to run slower, e.g. because of limitations of the SoC it is connected to. Or back to back PHYs are used and the PHY needs to be driven in order to establish link. In this case, phylink is used. Only instantiate phylink if it is required. If there is no PHY, or no fixed link properties, phylink can upset a link which works in the default configuration. Fixes: 0e27921816ad ("net: dsa: Use PHYLINK for the CPU/DSA ports") Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/port.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dsa/port.c b/net/dsa/port.c index ed7dabb57985..ec13dc666788 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -648,9 +648,14 @@ err_phy_connect: int dsa_port_link_register_of(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; + struct device_node *phy_np; - if (!ds->ops->adjust_link) - return dsa_port_phylink_register(dp); + if (!ds->ops->adjust_link) { + phy_np = of_parse_phandle(dp->dn, "phy-handle", 0); + if (of_phy_is_fixed_link(dp->dn) || phy_np) + return dsa_port_phylink_register(dp); + return 0; + } dev_warn(ds->dev, "Using legacy PHYLIB callbacks. Please migrate to PHYLINK!\n"); @@ -665,11 +670,12 @@ void dsa_port_link_unregister_of(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; - if (!ds->ops->adjust_link) { + if (!ds->ops->adjust_link && dp->pl) { rtnl_lock(); phylink_disconnect_phy(dp->pl); rtnl_unlock(); phylink_destroy(dp->pl); + dp->pl = NULL; return; } -- cgit v1.2.3 From 2677625387056136e256c743e3285b4fe3da87bb Mon Sep 17 00:00:00 2001 From: Paolo Lungaroni Date: Wed, 11 Mar 2020 17:54:06 +0100 Subject: seg6: fix SRv6 L2 tunnels to use IANA-assigned protocol number The Internet Assigned Numbers Authority (IANA) has recently assigned a protocol number value of 143 for Ethernet [1]. Before this assignment, encapsulation mechanisms such as Segment Routing used the IPv6-NoNxt protocol number (59) to indicate that the encapsulated payload is an Ethernet frame. In this patch, we add the definition of the Ethernet protocol number to the kernel headers and update the SRv6 L2 tunnels to use it. [1] https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml Signed-off-by: Paolo Lungaroni Reviewed-by: Andrea Mayer Acked-by: Ahmed Abdelsalam Signed-off-by: David S. Miller --- include/uapi/linux/in.h | 2 ++ net/ipv6/seg6_iptunnel.c | 2 +- net/ipv6/seg6_local.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 1521073b6348..8533bf07450f 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -74,6 +74,8 @@ enum { #define IPPROTO_UDPLITE IPPROTO_UDPLITE IPPROTO_MPLS = 137, /* MPLS in IP (RFC 4023) */ #define IPPROTO_MPLS IPPROTO_MPLS + IPPROTO_ETHERNET = 143, /* Ethernet-within-IPv6 Encapsulation */ +#define IPPROTO_ETHERNET IPPROTO_ETHERNET IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW IPPROTO_MPTCP = 262, /* Multipath TCP connection */ diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index ab7f124ff5d7..8c52efe299cc 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -268,7 +268,7 @@ static int seg6_do_srh(struct sk_buff *skb) skb_mac_header_rebuild(skb); skb_push(skb, skb->mac_len); - err = seg6_do_srh_encap(skb, tinfo->srh, NEXTHDR_NONE); + err = seg6_do_srh_encap(skb, tinfo->srh, IPPROTO_ETHERNET); if (err) return err; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 7cbc19731997..8165802d8e05 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -282,7 +282,7 @@ static int input_action_end_dx2(struct sk_buff *skb, struct net_device *odev; struct ethhdr *eth; - if (!decap_and_validate(skb, NEXTHDR_NONE)) + if (!decap_and_validate(skb, IPPROTO_ETHERNET)) goto drop; if (!pskb_may_pull(skb, ETH_HLEN)) -- cgit v1.2.3 From 06669ea346e476a5339033d77ef175566a40efbb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Mar 2020 11:44:26 -0700 Subject: net: memcg: fix lockdep splat in inet_csk_accept() Locking newsk while still holding the listener lock triggered a lockdep splat [1] We can simply move the memcg code after we release the listener lock, as this can also help if multiple threads are sharing a common listener. Also fix a typo while reading socket sk_rmem_alloc. [1] WARNING: possible recursive locking detected 5.6.0-rc3-syzkaller #0 Not tainted -------------------------------------------- syz-executor598/9524 is trying to acquire lock: ffff88808b5b8b90 (sk_lock-AF_INET6){+.+.}, at: lock_sock include/net/sock.h:1541 [inline] ffff88808b5b8b90 (sk_lock-AF_INET6){+.+.}, at: inet_csk_accept+0x69f/0xd30 net/ipv4/inet_connection_sock.c:492 but task is already holding lock: ffff88808b5b9590 (sk_lock-AF_INET6){+.+.}, at: lock_sock include/net/sock.h:1541 [inline] ffff88808b5b9590 (sk_lock-AF_INET6){+.+.}, at: inet_csk_accept+0x8d/0xd30 net/ipv4/inet_connection_sock.c:445 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(sk_lock-AF_INET6); lock(sk_lock-AF_INET6); *** DEADLOCK *** May be due to missing lock nesting notation 1 lock held by syz-executor598/9524: #0: ffff88808b5b9590 (sk_lock-AF_INET6){+.+.}, at: lock_sock include/net/sock.h:1541 [inline] #0: ffff88808b5b9590 (sk_lock-AF_INET6){+.+.}, at: inet_csk_accept+0x8d/0xd30 net/ipv4/inet_connection_sock.c:445 stack backtrace: CPU: 0 PID: 9524 Comm: syz-executor598 Not tainted 5.6.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x188/0x20d lib/dump_stack.c:118 print_deadlock_bug kernel/locking/lockdep.c:2370 [inline] check_deadlock kernel/locking/lockdep.c:2411 [inline] validate_chain kernel/locking/lockdep.c:2954 [inline] __lock_acquire.cold+0x114/0x288 kernel/locking/lockdep.c:3954 lock_acquire+0x197/0x420 kernel/locking/lockdep.c:4484 lock_sock_nested+0xc5/0x110 net/core/sock.c:2947 lock_sock include/net/sock.h:1541 [inline] inet_csk_accept+0x69f/0xd30 net/ipv4/inet_connection_sock.c:492 inet_accept+0xe9/0x7c0 net/ipv4/af_inet.c:734 __sys_accept4_file+0x3ac/0x5b0 net/socket.c:1758 __sys_accept4+0x53/0x90 net/socket.c:1809 __do_sys_accept4 net/socket.c:1821 [inline] __se_sys_accept4 net/socket.c:1818 [inline] __x64_sys_accept4+0x93/0xf0 net/socket.c:1818 do_syscall_64+0xf6/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4445c9 Code: e8 0c 0d 03 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffc35b37608 EFLAGS: 00000246 ORIG_RAX: 0000000000000120 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004445c9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000306777 R09: 0000000000306777 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00000000004053d0 R14: 0000000000000000 R15: 0000000000000000 Fixes: d752a4986532 ("net: memcg: late association of sock to memcg") Signed-off-by: Eric Dumazet Cc: Shakeel Butt Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 65a3b2565102..d545fb99a8a1 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -483,27 +483,27 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) spin_unlock_bh(&queue->fastopenq.lock); } - if (mem_cgroup_sockets_enabled) { +out: + release_sock(sk); + if (newsk && mem_cgroup_sockets_enabled) { int amt; /* atomically get the memory usage, set and charge the - * sk->sk_memcg. + * newsk->sk_memcg. */ lock_sock(newsk); - /* The sk has not been accepted yet, no need to look at - * sk->sk_wmem_queued. + /* The socket has not been accepted yet, no need to look at + * newsk->sk_wmem_queued. */ amt = sk_mem_pages(newsk->sk_forward_alloc + - atomic_read(&sk->sk_rmem_alloc)); + atomic_read(&newsk->sk_rmem_alloc)); mem_cgroup_sk_alloc(newsk); if (newsk->sk_memcg && amt) mem_cgroup_charge_skmem(newsk->sk_memcg, amt); release_sock(newsk); } -out: - release_sock(sk); if (req) reqsk_put(req); return newsk; -- cgit v1.2.3 From b09fe70ef520e011ba4a64f4b93f948a8f14717b Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 9 Mar 2020 10:39:53 -0700 Subject: taprio: Fix sending packets without dequeueing them There was a bug that was causing packets to be sent to the driver without first calling dequeue() on the "child" qdisc. And the KASAN report below shows that sending a packet without calling dequeue() leads to bad results. The problem is that when checking the last qdisc "child" we do not set the returned skb to NULL, which can cause it to be sent to the driver, and so after the skb is sent, it may be freed, and in some situations a reference to it may still be in the child qdisc, because it was never dequeued. The crash log looks like this: [ 19.937538] ================================================================== [ 19.938300] BUG: KASAN: use-after-free in taprio_dequeue_soft+0x620/0x780 [ 19.938968] Read of size 4 at addr ffff8881128628cc by task swapper/1/0 [ 19.939612] [ 19.939772] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.6.0-rc3+ #97 [ 19.940397] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qe4 [ 19.941523] Call Trace: [ 19.941774] [ 19.941985] dump_stack+0x97/0xe0 [ 19.942323] print_address_description.constprop.0+0x3b/0x60 [ 19.942884] ? taprio_dequeue_soft+0x620/0x780 [ 19.943325] ? taprio_dequeue_soft+0x620/0x780 [ 19.943767] __kasan_report.cold+0x1a/0x32 [ 19.944173] ? taprio_dequeue_soft+0x620/0x780 [ 19.944612] kasan_report+0xe/0x20 [ 19.944954] taprio_dequeue_soft+0x620/0x780 [ 19.945380] __qdisc_run+0x164/0x18d0 [ 19.945749] net_tx_action+0x2c4/0x730 [ 19.946124] __do_softirq+0x268/0x7bc [ 19.946491] irq_exit+0x17d/0x1b0 [ 19.946824] smp_apic_timer_interrupt+0xeb/0x380 [ 19.947280] apic_timer_interrupt+0xf/0x20 [ 19.947687] [ 19.947912] RIP: 0010:default_idle+0x2d/0x2d0 [ 19.948345] Code: 00 00 41 56 41 55 65 44 8b 2d 3f 8d 7c 7c 41 54 55 53 0f 1f 44 00 00 e8 b1 b2 c5 fd e9 07 00 3 [ 19.950166] RSP: 0018:ffff88811a3efda0 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13 [ 19.950909] RAX: 0000000080000000 RBX: ffff88811a3a9600 RCX: ffffffff8385327e [ 19.951608] RDX: 1ffff110234752c0 RSI: 0000000000000000 RDI: ffffffff8385262f [ 19.952309] RBP: ffffed10234752c0 R08: 0000000000000001 R09: ffffed10234752c1 [ 19.953009] R10: ffffed10234752c0 R11: ffff88811a3a9607 R12: 0000000000000001 [ 19.953709] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000000 [ 19.954408] ? default_idle_call+0x2e/0x70 [ 19.954816] ? default_idle+0x1f/0x2d0 [ 19.955192] default_idle_call+0x5e/0x70 [ 19.955584] do_idle+0x3d4/0x500 [ 19.955909] ? arch_cpu_idle_exit+0x40/0x40 [ 19.956325] ? _raw_spin_unlock_irqrestore+0x23/0x30 [ 19.956829] ? trace_hardirqs_on+0x30/0x160 [ 19.957242] cpu_startup_entry+0x19/0x20 [ 19.957633] start_secondary+0x2a6/0x380 [ 19.958026] ? set_cpu_sibling_map+0x18b0/0x18b0 [ 19.958486] secondary_startup_64+0xa4/0xb0 [ 19.958921] [ 19.959078] Allocated by task 33: [ 19.959412] save_stack+0x1b/0x80 [ 19.959747] __kasan_kmalloc.constprop.0+0xc2/0xd0 [ 19.960222] kmem_cache_alloc+0xe4/0x230 [ 19.960617] __alloc_skb+0x91/0x510 [ 19.960967] ndisc_alloc_skb+0x133/0x330 [ 19.961358] ndisc_send_ns+0x134/0x810 [ 19.961735] addrconf_dad_work+0xad5/0xf80 [ 19.962144] process_one_work+0x78e/0x13a0 [ 19.962551] worker_thread+0x8f/0xfa0 [ 19.962919] kthread+0x2ba/0x3b0 [ 19.963242] ret_from_fork+0x3a/0x50 [ 19.963596] [ 19.963753] Freed by task 33: [ 19.964055] save_stack+0x1b/0x80 [ 19.964386] __kasan_slab_free+0x12f/0x180 [ 19.964830] kmem_cache_free+0x80/0x290 [ 19.965231] ip6_mc_input+0x38a/0x4d0 [ 19.965617] ipv6_rcv+0x1a4/0x1d0 [ 19.965948] __netif_receive_skb_one_core+0xf2/0x180 [ 19.966437] netif_receive_skb+0x8c/0x3c0 [ 19.966846] br_handle_frame_finish+0x779/0x1310 [ 19.967302] br_handle_frame+0x42a/0x830 [ 19.967694] __netif_receive_skb_core+0xf0e/0x2a90 [ 19.968167] __netif_receive_skb_one_core+0x96/0x180 [ 19.968658] process_backlog+0x198/0x650 [ 19.969047] net_rx_action+0x2fa/0xaa0 [ 19.969420] __do_softirq+0x268/0x7bc [ 19.969785] [ 19.969940] The buggy address belongs to the object at ffff888112862840 [ 19.969940] which belongs to the cache skbuff_head_cache of size 224 [ 19.971202] The buggy address is located 140 bytes inside of [ 19.971202] 224-byte region [ffff888112862840, ffff888112862920) [ 19.972344] The buggy address belongs to the page: [ 19.972820] page:ffffea00044a1800 refcount:1 mapcount:0 mapping:ffff88811a2bd1c0 index:0xffff8881128625c0 compo0 [ 19.973930] flags: 0x8000000000010200(slab|head) [ 19.974388] raw: 8000000000010200 ffff88811a2ed650 ffff88811a2ed650 ffff88811a2bd1c0 [ 19.975151] raw: ffff8881128625c0 0000000000190013 00000001ffffffff 0000000000000000 [ 19.975915] page dumped because: kasan: bad access detected [ 19.976461] page_owner tracks the page as allocated [ 19.976946] page last allocated via order 2, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NO) [ 19.978332] prep_new_page+0x24b/0x330 [ 19.978707] get_page_from_freelist+0x2057/0x2c90 [ 19.979170] __alloc_pages_nodemask+0x218/0x590 [ 19.979619] new_slab+0x9d/0x300 [ 19.979948] ___slab_alloc.constprop.0+0x2f9/0x6f0 [ 19.980421] __slab_alloc.constprop.0+0x30/0x60 [ 19.980870] kmem_cache_alloc+0x201/0x230 [ 19.981269] __alloc_skb+0x91/0x510 [ 19.981620] alloc_skb_with_frags+0x78/0x4a0 [ 19.982043] sock_alloc_send_pskb+0x5eb/0x750 [ 19.982476] unix_stream_sendmsg+0x399/0x7f0 [ 19.982904] sock_sendmsg+0xe2/0x110 [ 19.983262] ____sys_sendmsg+0x4de/0x6d0 [ 19.983660] ___sys_sendmsg+0xe4/0x160 [ 19.984032] __sys_sendmsg+0xab/0x130 [ 19.984396] do_syscall_64+0xe7/0xae0 [ 19.984761] page last free stack trace: [ 19.985142] __free_pages_ok+0x432/0xbc0 [ 19.985533] qlist_free_all+0x56/0xc0 [ 19.985907] quarantine_reduce+0x149/0x170 [ 19.986315] __kasan_kmalloc.constprop.0+0x9e/0xd0 [ 19.986791] kmem_cache_alloc+0xe4/0x230 [ 19.987182] prepare_creds+0x24/0x440 [ 19.987548] do_faccessat+0x80/0x590 [ 19.987906] do_syscall_64+0xe7/0xae0 [ 19.988276] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 19.988775] [ 19.988930] Memory state around the buggy address: [ 19.989402] ffff888112862780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 19.990111] ffff888112862800: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb [ 19.990822] >ffff888112862880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 19.991529] ^ [ 19.992081] ffff888112862900: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc [ 19.992796] ffff888112862980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc Fixes: 5a781ccbd19e ("tc: Add support for configuring the taprio scheduler") Reported-by: Michael Schmidt Signed-off-by: Vinicius Costa Gomes Acked-by: Andre Guedes Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index ee717e05372b..b1eb12d33b9a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -564,8 +564,10 @@ static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch) prio = skb->priority; tc = netdev_get_prio_tc_map(dev, prio); - if (!(gate_mask & BIT(tc))) + if (!(gate_mask & BIT(tc))) { + skb = NULL; continue; + } len = qdisc_pkt_len(skb); guard = ktime_add_ns(taprio_get_time(q), @@ -575,13 +577,17 @@ static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch) * guard band ... */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && - ktime_after(guard, entry->close_time)) + ktime_after(guard, entry->close_time)) { + skb = NULL; continue; + } /* ... and no budget. */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && - atomic_sub_return(len, &entry->budget) < 0) + atomic_sub_return(len, &entry->budget) < 0) { + skb = NULL; continue; + } skb = child->ops->dequeue(child); if (unlikely(!skb)) -- cgit v1.2.3 From 158fe6665389964a1de212818b4a5c52b7f7aff4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Mar 2020 09:05:38 +0000 Subject: rxrpc: Abstract out the calculation of whether there's Tx space Abstract out the calculation of there being sufficient Tx buffer space. This is reproduced several times in the rxrpc sendmsg code. Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 813fd6888142..a13051d38097 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -17,6 +17,21 @@ #include #include "ar-internal.h" +/* + * Return true if there's sufficient Tx queue space. + */ +static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) +{ + unsigned int win_size = + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra); + rxrpc_seq_t tx_win = READ_ONCE(call->tx_hard_ack); + + if (_tx_win) + *_tx_win = tx_win; + return call->tx_top - tx_win < win_size; +} + /* * Wait for space to appear in the Tx queue or a signal to occur. */ @@ -26,9 +41,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, { for (;;) { set_current_state(TASK_INTERRUPTIBLE); - if (call->tx_top - call->tx_hard_ack < - min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra)) + if (rxrpc_check_tx_space(call, NULL)) return 0; if (call->state >= RXRPC_CALL_COMPLETE) @@ -68,9 +81,7 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, set_current_state(TASK_UNINTERRUPTIBLE); tx_win = READ_ONCE(call->tx_hard_ack); - if (call->tx_top - tx_win < - min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra)) + if (rxrpc_check_tx_space(call, &tx_win)) return 0; if (call->state >= RXRPC_CALL_COMPLETE) @@ -302,9 +313,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); - if (call->tx_top - call->tx_hard_ack >= - min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra)) { + if (!rxrpc_check_tx_space(call, NULL)) { ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; -- cgit v1.2.3 From e138aa7d3271ac1b0690ae2c9b04d51468dce1d6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Mar 2020 09:22:09 +0000 Subject: rxrpc: Fix call interruptibility handling Fix the interruptibility of kernel-initiated client calls so that they're either only interruptible when they're waiting for a call slot to come available or they're not interruptible at all. Either way, they're not interruptible during transmission. This should help prevent StoreData calls from being interrupted when writeback is in progress. It doesn't, however, handle interruption during the receive phase. Userspace-initiated calls are still interruptable. After the signal has been handled, sendmsg() will return the amount of data copied out of the buffer and userspace can perform another sendmsg() call to continue transmission. Fixes: bc5e3a546d55 ("rxrpc: Use MSG_WAITALL to tell sendmsg() to temporarily ignore signals") Signed-off-by: David Howells --- fs/afs/rxrpc.c | 3 ++- include/net/af_rxrpc.h | 8 +++++++- net/rxrpc/af_rxrpc.c | 4 ++-- net/rxrpc/ar-internal.h | 4 ++-- net/rxrpc/call_object.c | 3 +-- net/rxrpc/conn_client.c | 13 ++++++++++--- net/rxrpc/sendmsg.c | 44 ++++++++++++++++++++++++++++++++++++-------- 7 files changed, 60 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 58d396592250..4c28712bb7f6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -413,7 +413,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) afs_wake_up_async_call : afs_wake_up_call_waiter), call->upgrade, - call->intr, + (call->intr ? RXRPC_PREINTERRUPTIBLE : + RXRPC_UNINTERRUPTIBLE), call->debug_id); if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 1abae3c340a5..8e547b4d88c8 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -16,6 +16,12 @@ struct sock; struct socket; struct rxrpc_call; +enum rxrpc_interruptibility { + RXRPC_INTERRUPTIBLE, /* Call is interruptible */ + RXRPC_PREINTERRUPTIBLE, /* Call can be cancelled whilst waiting for a slot */ + RXRPC_UNINTERRUPTIBLE, /* Call should not be interruptible at all */ +}; + /* * Debug ID counter for tracing. */ @@ -41,7 +47,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, gfp_t, rxrpc_notify_rx_t, bool, - bool, + enum rxrpc_interruptibility, unsigned int); int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index fe42f986cd94..7603cf811f75 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -285,7 +285,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, gfp_t gfp, rxrpc_notify_rx_t notify_rx, bool upgrade, - bool intr, + enum rxrpc_interruptibility interruptibility, unsigned int debug_id) { struct rxrpc_conn_parameters cp; @@ -310,7 +310,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, memset(&p, 0, sizeof(p)); p.user_call_ID = user_call_ID; p.tx_total_len = tx_total_len; - p.intr = intr; + p.interruptibility = interruptibility; memset(&cp, 0, sizeof(cp)); cp.local = rx->local; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7d730c438404..1f72f43b082d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -489,7 +489,6 @@ enum rxrpc_call_flag { RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */ - RXRPC_CALL_IS_INTR, /* The call is interruptible */ RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */ }; @@ -598,6 +597,7 @@ struct rxrpc_call { atomic_t usage; u16 service_id; /* service ID */ u8 security_ix; /* Security type */ + enum rxrpc_interruptibility interruptibility; /* At what point call may be interrupted */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ int debug_id; /* debug ID for printks */ @@ -721,7 +721,7 @@ struct rxrpc_call_params { u32 normal; /* Max time since last call packet (msec) */ } timeouts; u8 nr_timeouts; /* Number of timeouts specified */ - bool intr; /* The call is interruptible */ + enum rxrpc_interruptibility interruptibility; /* How is interruptible is the call? */ }; struct rxrpc_send_params { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index c9f34b0a11df..f07970207b54 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -237,8 +237,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return call; } - if (p->intr) - __set_bit(RXRPC_CALL_IS_INTR, &call->flags); + call->interruptibility = p->interruptibility; call->tx_total_len = p->tx_total_len; trace_rxrpc_call(call->debug_id, rxrpc_call_new_client, atomic_read(&call->usage), diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index ea7d4c21f889..f2a1a5dbb5a7 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -655,13 +655,20 @@ static int rxrpc_wait_for_channel(struct rxrpc_call *call, gfp_t gfp) add_wait_queue_exclusive(&call->waitq, &myself); for (;;) { - if (test_bit(RXRPC_CALL_IS_INTR, &call->flags)) + switch (call->interruptibility) { + case RXRPC_INTERRUPTIBLE: + case RXRPC_PREINTERRUPTIBLE: set_current_state(TASK_INTERRUPTIBLE); - else + break; + case RXRPC_UNINTERRUPTIBLE: + default: set_current_state(TASK_UNINTERRUPTIBLE); + break; + } if (call->call_id) break; - if (test_bit(RXRPC_CALL_IS_INTR, &call->flags) && + if ((call->interruptibility == RXRPC_INTERRUPTIBLE || + call->interruptibility == RXRPC_PREINTERRUPTIBLE) && signal_pending(current)) { ret = -ERESTARTSYS; break; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index a13051d38097..1eccfb92c9e1 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -62,7 +62,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, * Wait for space to appear in the Tx queue uninterruptibly, but with * a timeout of 2*RTT if no progress was made and a signal occurred. */ -static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, +static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, struct rxrpc_call *call) { rxrpc_seq_t tx_start, tx_win; @@ -87,8 +87,7 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, if (call->state >= RXRPC_CALL_COMPLETE) return call->error; - if (test_bit(RXRPC_CALL_IS_INTR, &call->flags) && - timeout == 0 && + if (timeout == 0 && tx_win == tx_start && signal_pending(current)) return -EINTR; @@ -102,6 +101,26 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, } } +/* + * Wait for space to appear in the Tx queue uninterruptibly. + */ +static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, + struct rxrpc_call *call, + long *timeo) +{ + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (rxrpc_check_tx_space(call, NULL)) + return 0; + + if (call->state >= RXRPC_CALL_COMPLETE) + return call->error; + + trace_rxrpc_transmit(call, rxrpc_transmit_wait); + *timeo = schedule_timeout(*timeo); + } +} + /* * wait for space to appear in the transmit/ACK window * - caller holds the socket locked @@ -119,10 +138,19 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, add_wait_queue(&call->waitq, &myself); - if (waitall) - ret = rxrpc_wait_for_tx_window_nonintr(rx, call); - else - ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo); + switch (call->interruptibility) { + case RXRPC_INTERRUPTIBLE: + if (waitall) + ret = rxrpc_wait_for_tx_window_waitall(rx, call); + else + ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo); + break; + case RXRPC_PREINTERRUPTIBLE: + case RXRPC_UNINTERRUPTIBLE: + default: + ret = rxrpc_wait_for_tx_window_nonintr(rx, call, timeo); + break; + } remove_wait_queue(&call->waitq, &myself); set_current_state(TASK_RUNNING); @@ -628,7 +656,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) .call.tx_total_len = -1, .call.user_call_ID = 0, .call.nr_timeouts = 0, - .call.intr = true, + .call.interruptibility = RXRPC_INTERRUPTIBLE, .abort_code = 0, .command = RXRPC_CMD_SEND_DATA, .exclusive = false, -- cgit v1.2.3 From 498b577660f08cef5d9e78e0ed6dcd4c0939e98c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Mar 2020 17:30:27 +0000 Subject: rxrpc: Fix sendmsg(MSG_WAITALL) handling Fix the handling of sendmsg() with MSG_WAITALL for userspace to round the timeout for when a signal occurs up to at least two jiffies as a 1 jiffy timeout may end up being effectively 0 if jiffies wraps at the wrong time. Fixes: bc5e3a546d55 ("rxrpc: Use MSG_WAITALL to tell sendmsg() to temporarily ignore signals") Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 1eccfb92c9e1..0fcf157aa09f 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -71,8 +71,8 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, rtt = READ_ONCE(call->peer->rtt); rtt2 = nsecs_to_jiffies64(rtt) * 2; - if (rtt2 < 1) - rtt2 = 1; + if (rtt2 < 2) + rtt2 = 2; timeout = rtt2; tx_start = READ_ONCE(call->tx_hard_ack); -- cgit v1.2.3 From 7d7587db0d7fd1138f2afcffdc46a8e15630b944 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Mar 2020 21:40:06 +0000 Subject: afs: Fix client call Rx-phase signal handling Fix the handling of signals in client rxrpc calls made by the afs filesystem. Ignore signals completely, leaving call abandonment or connection loss to be detected by timeouts inside AF_RXRPC. Allowing a filesystem call to be interrupted after the entire request has been transmitted and an abort sent means that the server may or may not have done the action - and we don't know. It may even be worse than that for older servers. Fixes: bc5e3a546d55 ("rxrpc: Use MSG_WAITALL to tell sendmsg() to temporarily ignore signals") Signed-off-by: David Howells --- fs/afs/rxrpc.c | 34 ++-------------------------------- include/net/af_rxrpc.h | 4 +--- net/rxrpc/af_rxrpc.c | 33 +++------------------------------ net/rxrpc/ar-internal.h | 1 - net/rxrpc/input.c | 1 - 5 files changed, 6 insertions(+), 67 deletions(-) (limited to 'net') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 972e3aafa361..1ecc67da6c1a 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -604,11 +604,7 @@ call_complete: long afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor *ac) { - signed long rtt2, timeout; long ret; - bool stalled = false; - u64 rtt; - u32 life, last_life; bool rxrpc_complete = false; DECLARE_WAITQUEUE(myself, current); @@ -619,14 +615,6 @@ long afs_wait_for_call_to_complete(struct afs_call *call, if (ret < 0) goto out; - rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); - rtt2 = nsecs_to_jiffies64(rtt) * 2; - if (rtt2 < 2) - rtt2 = 2; - - timeout = rtt2; - rxrpc_kernel_check_life(call->net->socket, call->rxcall, &last_life); - add_wait_queue(&call->waitq, &myself); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -637,37 +625,19 @@ long afs_wait_for_call_to_complete(struct afs_call *call, call->need_attention = false; __set_current_state(TASK_RUNNING); afs_deliver_to_call(call); - timeout = rtt2; continue; } if (afs_check_call_state(call, AFS_CALL_COMPLETE)) break; - if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall, &life)) { + if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) { /* rxrpc terminated the call. */ rxrpc_complete = true; break; } - if (call->intr && timeout == 0 && - life == last_life && signal_pending(current)) { - if (stalled) - break; - __set_current_state(TASK_RUNNING); - rxrpc_kernel_probe_life(call->net->socket, call->rxcall); - timeout = rtt2; - stalled = true; - continue; - } - - if (life != last_life) { - timeout = rtt2; - last_life = life; - stalled = false; - } - - timeout = schedule_timeout(timeout); + schedule(); } remove_wait_queue(&call->waitq, &myself); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 8e547b4d88c8..04e97bab6f28 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -64,9 +64,7 @@ int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t, unsigned int); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); -bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *, - u32 *); -void rxrpc_kernel_probe_life(struct socket *, struct rxrpc_call *); +bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); bool rxrpc_kernel_get_reply_time(struct socket *, struct rxrpc_call *, ktime_t *); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 7603cf811f75..15ee92d79581 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -371,44 +371,17 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * rxrpc_kernel_check_life - Check to see whether a call is still alive * @sock: The socket the call is on * @call: The call to check - * @_life: Where to store the life value * - * Allow a kernel service to find out whether a call is still alive - ie. we're - * getting ACKs from the server. Passes back in *_life a number representing - * the life state which can be compared to that returned by a previous call and - * return true if the call is still alive. - * - * If the life state stalls, rxrpc_kernel_probe_life() should be called and - * then 2RTT waited. + * Allow a kernel service to find out whether a call is still alive - + * ie. whether it has completed. */ bool rxrpc_kernel_check_life(const struct socket *sock, - const struct rxrpc_call *call, - u32 *_life) + const struct rxrpc_call *call) { - *_life = call->acks_latest; return call->state != RXRPC_CALL_COMPLETE; } EXPORT_SYMBOL(rxrpc_kernel_check_life); -/** - * rxrpc_kernel_probe_life - Poke the peer to see if it's still alive - * @sock: The socket the call is on - * @call: The call to check - * - * In conjunction with rxrpc_kernel_check_life(), allow a kernel service to - * find out whether a call is still alive by pinging it. This should cause the - * life state to be bumped in about 2*RTT. - * - * The must be called in TASK_RUNNING state on pain of might_sleep() objecting. - */ -void rxrpc_kernel_probe_life(struct socket *sock, struct rxrpc_call *call) -{ - rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false, - rxrpc_propose_ack_ping_for_check_life); - rxrpc_send_ack_packet(call, true, NULL); -} -EXPORT_SYMBOL(rxrpc_kernel_probe_life); - /** * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. * @sock: The socket the call is on diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1f72f43b082d..3eb1ab40ca5c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -675,7 +675,6 @@ struct rxrpc_call { /* transmission-phase ACK management */ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ - rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_seq_t acks_lost_top; /* tx_top at the time lost-ack ping sent */ rxrpc_serial_t acks_lost_ping; /* Serial number of probe ACK */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index ef10fbf71b15..69e09d69c896 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -882,7 +882,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) before(prev_pkt, call->ackr_prev_seq)) goto out; call->acks_latest_ts = skb->tstamp; - call->acks_latest = sp->hdr.serial; call->ackr_first_seq = first_soft_ack; call->ackr_prev_seq = prev_pkt; -- cgit v1.2.3 From b1be2e8cd290f620777bfdb8aa00890cd2fa02b5 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 11 Mar 2020 22:42:27 -0700 Subject: net_sched: hold rtnl lock in tcindex_partial_destroy_work() syzbot reported a use-after-free in tcindex_dump(). This is due to the lack of RTNL in the deferred rcu work. We queue this work with RTNL in tcindex_change(), later, tcindex_dump() is called: fh = tp->ops->get(tp, t->tcm_handle); ... err = tp->ops->change(..., &fh, ...); tfilter_notify(..., fh, ...); but there is nothing to serialize the pending tcindex_partial_destroy_work() with tcindex_dump(). Fix this by simply holding RTNL in tcindex_partial_destroy_work(), so that it won't be called until RTNL is released after tc_new_tfilter() is completed. Reported-and-tested-by: syzbot+653090db2562495901dc@syzkaller.appspotmail.com Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()") Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 09b7dc5fe7e0..f2cb24b6f0cf 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -261,8 +261,10 @@ static void tcindex_partial_destroy_work(struct work_struct *work) struct tcindex_data, rwork); + rtnl_lock(); kfree(p->perfect); kfree(p); + rtnl_unlock(); } static void tcindex_free_perfect_hash(struct tcindex_data *cp) -- cgit v1.2.3 From 0d1c3530e1bd38382edef72591b78e877e0edcd3 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 11 Mar 2020 22:42:28 -0700 Subject: net_sched: keep alloc_hash updated after hash allocation In commit 599be01ee567 ("net_sched: fix an OOB access in cls_tcindex") I moved cp->hash calculation before the first tcindex_alloc_perfect_hash(), but cp->alloc_hash is left untouched. This difference could lead to another out of bound access. cp->alloc_hash should always be the size allocated, we should update it after this tcindex_alloc_perfect_hash(). Reported-and-tested-by: syzbot+dcc34d54d68ef7d2d53d@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+c72da7b9ed57cde6fca2@syzkaller.appspotmail.com Fixes: 599be01ee567 ("net_sched: fix an OOB access in cls_tcindex") Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index f2cb24b6f0cf..9904299424a1 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -359,6 +359,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (tcindex_alloc_perfect_hash(net, cp) < 0) goto errout; + cp->alloc_hash = cp->hash; for (i = 0; i < min(cp->hash, p->hash); i++) cp->perfect[i].res = p->perfect[i].res; balloc = 1; -- cgit v1.2.3 From 13d0f7b814d9b4c67e60d8c2820c86ea181e7d99 Mon Sep 17 00:00:00 2001 From: Bruno Meneguele Date: Thu, 12 Mar 2020 20:08:20 -0300 Subject: net/bpfilter: fix dprintf usage for /dev/kmsg The bpfilter UMH code was recently changed to log its informative messages to /dev/kmsg, however this interface doesn't support SEEK_CUR yet, used by dprintf(). As result dprintf() returns -EINVAL and doesn't log anything. However there already had some discussions about supporting SEEK_CUR into /dev/kmsg interface in the past it wasn't concluded. Since the only user of that from userspace perspective inside the kernel is the bpfilter UMH (userspace) module it's better to correct it here instead waiting a conclusion on the interface. Fixes: 36c4357c63f3 ("net: bpfilter: print umh messages to /dev/kmsg") Signed-off-by: Bruno Meneguele Signed-off-by: David S. Miller --- net/bpfilter/main.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c index 77396a098fbe..efea4874743e 100644 --- a/net/bpfilter/main.c +++ b/net/bpfilter/main.c @@ -10,7 +10,7 @@ #include #include "msgfmt.h" -int debug_fd; +FILE *debug_f; static int handle_get_cmd(struct mbox_request *cmd) { @@ -35,9 +35,10 @@ static void loop(void) struct mbox_reply reply; int n; + fprintf(debug_f, "testing the buffer\n"); n = read(0, &req, sizeof(req)); if (n != sizeof(req)) { - dprintf(debug_fd, "invalid request %d\n", n); + fprintf(debug_f, "invalid request %d\n", n); return; } @@ -47,7 +48,7 @@ static void loop(void) n = write(1, &reply, sizeof(reply)); if (n != sizeof(reply)) { - dprintf(debug_fd, "reply failed %d\n", n); + fprintf(debug_f, "reply failed %d\n", n); return; } } @@ -55,9 +56,10 @@ static void loop(void) int main(void) { - debug_fd = open("/dev/kmsg", 00000002); - dprintf(debug_fd, "Started bpfilter\n"); + debug_f = fopen("/dev/kmsg", "w"); + setvbuf(debug_f, 0, _IOLBF, 0); + fprintf(debug_f, "Started bpfilter\n"); loop(); - close(debug_fd); + fclose(debug_f); return 0; } -- cgit v1.2.3 From e1f8f78ffe9854308b9e12a73ebe4e909074fc33 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 13 Mar 2020 13:39:36 +0200 Subject: net: ip_gre: Separate ERSPAN newlink / changelink callbacks ERSPAN shares most of the code path with GRE and gretap code. While that helps keep the code compact, it is also error prone. Currently a broken userspace can turn a gretap tunnel into a de facto ERSPAN one by passing IFLA_GRE_ERSPAN_VER. There has been a similar issue in ip6gretap in the past. To prevent these problems in future, split the newlink and changelink code paths. Split the ERSPAN code out of ipgre_netlink_parms() into a new function erspan_netlink_parms(). Extract a piece of common logic from ipgre_newlink() and ipgre_changelink() into ipgre_newlink_encap_setup(). Add erspan_newlink() and erspan_changelink(). Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 103 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 85 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8274f98c511c..7765c65fc7d2 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1153,6 +1153,22 @@ static int ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + return 0; +} + +static int erspan_netlink_parms(struct net_device *dev, + struct nlattr *data[], + struct nlattr *tb[], + struct ip_tunnel_parm *parms, + __u32 *fwmark) +{ + struct ip_tunnel *t = netdev_priv(dev); + int err; + + err = ipgre_netlink_parms(dev, data, tb, parms, fwmark); + if (err) + return err; + if (data[IFLA_GRE_ERSPAN_VER]) { t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); @@ -1276,45 +1292,70 @@ static void ipgre_tap_setup(struct net_device *dev) ip_tunnel_setup(dev, gre_tap_net_id); } -static int ipgre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) +static int +ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[]) { - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; - __u32 fwmark = 0; - int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); - err = ip_tunnel_encap_setup(t, &ipencap); + int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } + return 0; +} + +static int ipgre_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip_tunnel_parm p; + __u32 fwmark = 0; + int err; + + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; + err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; return ip_tunnel_newlink(dev, tb, &p, fwmark); } +static int erspan_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip_tunnel_parm p; + __u32 fwmark = 0; + int err; + + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; + + err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); + if (err) + return err; + return ip_tunnel_newlink(dev, tb, &p, fwmark); +} + static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_encap ipencap; __u32 fwmark = t->fwmark; struct ip_tunnel_parm p; int err; - if (ipgre_netlink_encap_parms(data, &ipencap)) { - err = ip_tunnel_encap_setup(t, &ipencap); - - if (err < 0) - return err; - } + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) @@ -1327,8 +1368,34 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], t->parms.i_flags = p.i_flags; t->parms.o_flags = p.o_flags; - if (strcmp(dev->rtnl_link_ops->kind, "erspan")) - ipgre_link_update(dev, !tb[IFLA_MTU]); + ipgre_link_update(dev, !tb[IFLA_MTU]); + + return 0; +} + +static int erspan_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip_tunnel *t = netdev_priv(dev); + __u32 fwmark = t->fwmark; + struct ip_tunnel_parm p; + int err; + + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; + + err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); + if (err < 0) + return err; + + err = ip_tunnel_changelink(dev, tb, &p, fwmark); + if (err < 0) + return err; + + t->parms.i_flags = p.i_flags; + t->parms.o_flags = p.o_flags; return 0; } @@ -1519,8 +1586,8 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = { .priv_size = sizeof(struct ip_tunnel), .setup = erspan_setup, .validate = erspan_validate, - .newlink = ipgre_newlink, - .changelink = ipgre_changelink, + .newlink = erspan_newlink, + .changelink = erspan_changelink, .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = ipgre_fill_info, -- cgit v1.2.3 From 61fad6816fc10fb8793a925d5c1256d1c3db0cd2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 13 Mar 2020 12:18:09 -0400 Subject: net/packet: tpacket_rcv: avoid a producer race condition PACKET_RX_RING can cause multiple writers to access the same slot if a fast writer wraps the ring while a slow writer is still copying. This is particularly likely with few, large, slots (e.g., GSO packets). Synchronize kernel thread ownership of rx ring slots with a bitmap. Writers acquire a slot race-free by testing tp_status TP_STATUS_KERNEL while holding the sk receive queue lock. They release this lock before copying and set tp_status to TP_STATUS_USER to release to userspace when done. During copying, another writer may take the lock, also see TP_STATUS_KERNEL, and start writing to the same slot. Introduce a new rx_owner_map bitmap with a bit per slot. To acquire a slot, test and set with the lock held. To release race-free, update tp_status and owner bit as a transaction, so take the lock again. This is the one of a variety of discussed options (see Link below): * instead of a shadow ring, embed the data in the slot itself, such as in tp_padding. But any test for this field may match a value left by userspace, causing deadlock. * avoid the lock on release. This leaves a small race if releasing the shadow slot before setting TP_STATUS_USER. The below reproducer showed that this race is not academic. If releasing the slot after tp_status, the race is more subtle. See the first link for details. * add a new tp_status TP_KERNEL_OWNED to avoid the transactional store of two fields. But, legacy applications may interpret all non-zero tp_status as owned by the user. As libpcap does. So this is possible only opt-in by newer processes. It can be added as an optional mode. * embed the struct at the tail of pg_vec to avoid extra allocation. The implementation proved no less complex than a separate field. The additional locking cost on release adds contention, no different than scaling on multicore or multiqueue h/w. In practice, below reproducer nor small packet tcpdump showed a noticeable change in perf report in cycles spent in spinlock. Where contention is problematic, packet sockets support mitigation through PACKET_FANOUT. And we can consider adding opt-in state TP_KERNEL_OWNED. Easy to reproduce by running multiple netperf or similar TCP_STREAM flows concurrently with `tcpdump -B 129 -n greater 60000`. Based on an earlier patchset by Jon Rosen. See links below. I believe this issue goes back to the introduction of tpacket_rcv, which predates git history. Link: https://www.mail-archive.com/netdev@vger.kernel.org/msg237222.html Suggested-by: Jon Rosen Signed-off-by: Willem de Bruijn Signed-off-by: Jon Rosen Signed-off-by: David S. Miller --- net/packet/af_packet.c | 21 +++++++++++++++++++++ net/packet/internal.h | 5 ++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e5b0986215d2..29bd405adbbd 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2173,6 +2173,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct timespec64 ts; __u32 ts_status; bool is_drop_n_account = false; + unsigned int slot_id = 0; bool do_vnet = false; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. @@ -2275,6 +2276,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (!h.raw) goto drop_n_account; + if (po->tp_version <= TPACKET_V2) { + slot_id = po->rx_ring.head; + if (test_bit(slot_id, po->rx_ring.rx_owner_map)) + goto drop_n_account; + __set_bit(slot_id, po->rx_ring.rx_owner_map); + } + if (do_vnet && virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), @@ -2380,7 +2388,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, #endif if (po->tp_version <= TPACKET_V2) { + spin_lock(&sk->sk_receive_queue.lock); __packet_set_status(po, h.raw, status); + __clear_bit(slot_id, po->rx_ring.rx_owner_map); + spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); } else { prb_clear_blk_fill_status(&po->rx_ring); @@ -4277,6 +4288,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, { struct pgv *pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); + unsigned long *rx_owner_map = NULL; int was_running, order = 0; struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; @@ -4362,6 +4374,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, } break; default: + if (!tx_ring) { + rx_owner_map = bitmap_alloc(req->tp_frame_nr, + GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); + if (!rx_owner_map) + goto out_free_pg_vec; + } break; } } @@ -4391,6 +4409,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, err = 0; spin_lock_bh(&rb_queue->lock); swap(rb->pg_vec, pg_vec); + if (po->tp_version <= TPACKET_V2) + swap(rb->rx_owner_map, rx_owner_map); rb->frame_max = (req->tp_frame_nr - 1); rb->head = 0; rb->frame_size = req->tp_frame_size; @@ -4422,6 +4442,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, } out_free_pg_vec: + bitmap_free(rx_owner_map); if (pg_vec) free_pg_vec(pg_vec, order, req->tp_block_nr); out: diff --git a/net/packet/internal.h b/net/packet/internal.h index 82fb2b10f790..907f4cd2a718 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -70,7 +70,10 @@ struct packet_ring_buffer { unsigned int __percpu *pending_refcnt; - struct tpacket_kbdq_core prb_bdqc; + union { + unsigned long *rx_owner_map; + struct tpacket_kbdq_core prb_bdqc; + }; }; extern struct mutex fanout_mutex; -- cgit v1.2.3 From 173756b86803655d70af7732079b3aa935e6ab68 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 13 Mar 2020 06:50:14 +0000 Subject: hsr: use rcu_read_lock() in hsr_get_node_{list/status}() hsr_get_node_{list/status}() are not under rtnl_lock() because they are callback functions of generic netlink. But they use __dev_get_by_index() without rtnl_lock(). So, it would use unsafe data. In order to fix it, rcu_read_lock() and dev_get_by_index_rcu() are used instead of __dev_get_by_index(). Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_framereg.c | 9 ++------- net/hsr/hsr_netlink.c | 39 +++++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 3ba7f61be107..a64bb64935a6 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -482,12 +482,9 @@ int hsr_get_node_data(struct hsr_priv *hsr, struct hsr_port *port; unsigned long tdiff; - rcu_read_lock(); node = find_node_by_addr_A(&hsr->node_db, addr); - if (!node) { - rcu_read_unlock(); - return -ENOENT; /* No such entry */ - } + if (!node) + return -ENOENT; ether_addr_copy(addr_b, node->macaddress_B); @@ -522,7 +519,5 @@ int hsr_get_node_data(struct hsr_priv *hsr, *addr_b_ifindex = -1; } - rcu_read_unlock(); - return 0; } diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 8dc0547f01d0..d6760df2ad1f 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -251,15 +251,16 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) if (!na) goto invalid; - hsr_dev = __dev_get_by_index(genl_info_net(info), - nla_get_u32(info->attrs[HSR_A_IFINDEX])); + rcu_read_lock(); + hsr_dev = dev_get_by_index_rcu(genl_info_net(info), + nla_get_u32(info->attrs[HSR_A_IFINDEX])); if (!hsr_dev) - goto invalid; + goto rcu_unlock; if (!is_hsr_master(hsr_dev)) - goto invalid; + goto rcu_unlock; /* Send reply */ - skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb_out) { res = -ENOMEM; goto fail; @@ -313,12 +314,10 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) res = nla_put_u16(skb_out, HSR_A_IF1_SEQ, hsr_node_if1_seq); if (res < 0) goto nla_put_failure; - rcu_read_lock(); port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (port) res = nla_put_u32(skb_out, HSR_A_IF1_IFINDEX, port->dev->ifindex); - rcu_read_unlock(); if (res < 0) goto nla_put_failure; @@ -328,20 +327,22 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) res = nla_put_u16(skb_out, HSR_A_IF2_SEQ, hsr_node_if2_seq); if (res < 0) goto nla_put_failure; - rcu_read_lock(); port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); if (port) res = nla_put_u32(skb_out, HSR_A_IF2_IFINDEX, port->dev->ifindex); - rcu_read_unlock(); if (res < 0) goto nla_put_failure; + rcu_read_unlock(); + genlmsg_end(skb_out, msg_head); genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid); return 0; +rcu_unlock: + rcu_read_unlock(); invalid: netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL); return 0; @@ -351,6 +352,7 @@ nla_put_failure: /* Fall through */ fail: + rcu_read_unlock(); return res; } @@ -377,15 +379,16 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) if (!na) goto invalid; - hsr_dev = __dev_get_by_index(genl_info_net(info), - nla_get_u32(info->attrs[HSR_A_IFINDEX])); + rcu_read_lock(); + hsr_dev = dev_get_by_index_rcu(genl_info_net(info), + nla_get_u32(info->attrs[HSR_A_IFINDEX])); if (!hsr_dev) - goto invalid; + goto rcu_unlock; if (!is_hsr_master(hsr_dev)) - goto invalid; + goto rcu_unlock; /* Send reply */ - skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb_out) { res = -ENOMEM; goto fail; @@ -405,14 +408,11 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) hsr = netdev_priv(hsr_dev); - rcu_read_lock(); pos = hsr_get_next_node(hsr, NULL, addr); while (pos) { res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr); - if (res < 0) { - rcu_read_unlock(); + if (res < 0) goto nla_put_failure; - } pos = hsr_get_next_node(hsr, pos, addr); } rcu_read_unlock(); @@ -422,6 +422,8 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) return 0; +rcu_unlock: + rcu_read_unlock(); invalid: netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL); return 0; @@ -431,6 +433,7 @@ nla_put_failure: /* Fall through */ fail: + rcu_read_unlock(); return res; } -- cgit v1.2.3 From ca19c70f5225771c05bcdcb832b4eb84d7271c5e Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 13 Mar 2020 06:50:24 +0000 Subject: hsr: add restart routine into hsr_get_node_list() The hsr_get_node_list() is to send node addresses to the userspace. If there are so many nodes, it could fail because of buffer size. In order to avoid this failure, the restart routine is added. Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_netlink.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index d6760df2ad1f..726bfe923999 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -360,16 +360,14 @@ fail: */ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) { - /* For receiving */ - struct nlattr *na; + unsigned char addr[ETH_ALEN]; struct net_device *hsr_dev; - - /* For sending */ struct sk_buff *skb_out; - void *msg_head; struct hsr_priv *hsr; - void *pos; - unsigned char addr[ETH_ALEN]; + bool restart = false; + struct nlattr *na; + void *pos = NULL; + void *msg_head; int res; if (!info) @@ -387,8 +385,9 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) if (!is_hsr_master(hsr_dev)) goto rcu_unlock; +restart: /* Send reply */ - skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); + skb_out = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb_out) { res = -ENOMEM; goto fail; @@ -402,17 +401,28 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) goto nla_put_failure; } - res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex); - if (res < 0) - goto nla_put_failure; + if (!restart) { + res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex); + if (res < 0) + goto nla_put_failure; + } hsr = netdev_priv(hsr_dev); - pos = hsr_get_next_node(hsr, NULL, addr); + if (!pos) + pos = hsr_get_next_node(hsr, NULL, addr); while (pos) { res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr); - if (res < 0) + if (res < 0) { + if (res == -EMSGSIZE) { + genlmsg_end(skb_out, msg_head); + genlmsg_unicast(genl_info_net(info), skb_out, + info->snd_portid); + restart = true; + goto restart; + } goto nla_put_failure; + } pos = hsr_get_next_node(hsr, pos, addr); } rcu_read_unlock(); @@ -429,7 +439,7 @@ invalid: return 0; nla_put_failure: - kfree_skb(skb_out); + nlmsg_free(skb_out); /* Fall through */ fail: -- cgit v1.2.3 From 09e91dbea0aa32be02d8877bd50490813de56b9a Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 13 Mar 2020 06:50:33 +0000 Subject: hsr: set .netnsok flag The hsr module has been supporting the list and status command. (HSR_C_GET_NODE_LIST and HSR_C_GET_NODE_STATUS) These commands send node information to the user-space via generic netlink. But, in the non-init_net namespace, these commands are not allowed because .netnsok flag is false. So, there is no way to get node information in the non-init_net namespace. Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 726bfe923999..fae21c863b1f 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -470,6 +470,7 @@ static struct genl_family hsr_genl_family __ro_after_init = { .version = 1, .maxattr = HSR_A_MAX, .policy = hsr_genl_policy, + .netnsok = true, .module = THIS_MODULE, .ops = hsr_ops, .n_ops = ARRAY_SIZE(hsr_ops), -- cgit v1.2.3 From ef299cc3fa1a9e1288665a9fdc8bff55629fd359 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 13 Mar 2020 22:29:54 -0700 Subject: net_sched: cls_route: remove the right filter from hashtable route4_change() allocates a new filter and copies values from the old one. After the new filter is inserted into the hash table, the old filter should be removed and freed, as the final step of the update. However, the current code mistakenly removes the new one. This looks apparently wrong to me, and it causes double "free" and use-after-free too, as reported by syzbot. Reported-and-tested-by: syzbot+f9b32aaacd60305d9687@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+2f8c233f131943d6056d@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+9c2df9fd5e9445b74e01@syzkaller.appspotmail.com Fixes: 1109c00547fc ("net: sched: RCU cls_route") Cc: Jamal Hadi Salim Cc: Jiri Pirko Cc: John Fastabend Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 6f8786b06bde..5efa3e7ace15 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -534,8 +534,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, fp = &b->ht[h]; for (pfp = rtnl_dereference(*fp); pfp; fp = &pfp->next, pfp = rtnl_dereference(*fp)) { - if (pfp == f) { - *fp = f->next; + if (pfp == fold) { + rcu_assign_pointer(*fp, fold->next); break; } } -- cgit v1.2.3 From fe2a31d790f81bd14a76de3d3b87f4f1362f60cd Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 15 Mar 2020 18:17:43 +0100 Subject: netlink: allow extack cookie also for error messages Commit ba0dc5f6e0ba ("netlink: allow sending extended ACK with cookie on success") introduced a cookie which can be sent to userspace as part of extended ack message in the form of NLMSGERR_ATTR_COOKIE attribute. Currently the cookie is ignored if error code is non-zero but there is no technical reason for such limitation and it can be useful to provide machine parseable information as part of an error message. Include NLMSGERR_ATTR_COOKIE whenever the cookie has been set, regardless of error code. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5313f1cec170..2f234791b879 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2392,19 +2392,14 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, if (nlk_has_extack && extack && extack->_msg) tlvlen += nla_total_size(strlen(extack->_msg) + 1); - if (err) { - if (!(nlk->flags & NETLINK_F_CAP_ACK)) - payload += nlmsg_len(nlh); - else - flags |= NLM_F_CAPPED; - if (nlk_has_extack && extack && extack->bad_attr) - tlvlen += nla_total_size(sizeof(u32)); - } else { + if (err && !(nlk->flags & NETLINK_F_CAP_ACK)) + payload += nlmsg_len(nlh); + else flags |= NLM_F_CAPPED; - - if (nlk_has_extack && extack && extack->cookie_len) - tlvlen += nla_total_size(extack->cookie_len); - } + if (err && nlk_has_extack && extack && extack->bad_attr) + tlvlen += nla_total_size(sizeof(u32)); + if (nlk_has_extack && extack && extack->cookie_len) + tlvlen += nla_total_size(extack->cookie_len); if (tlvlen) flags |= NLM_F_ACK_TLVS; @@ -2427,20 +2422,16 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); } - if (err) { - if (extack->bad_attr && - !WARN_ON((u8 *)extack->bad_attr < in_skb->data || - (u8 *)extack->bad_attr >= in_skb->data + - in_skb->len)) - WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, - (u8 *)extack->bad_attr - - (u8 *)nlh)); - } else { - if (extack->cookie_len) - WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, - extack->cookie_len, - extack->cookie)); - } + if (err && extack->bad_attr && + !WARN_ON((u8 *)extack->bad_attr < in_skb->data || + (u8 *)extack->bad_attr >= in_skb->data + + in_skb->len)) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, + (u8 *)extack->bad_attr - + (u8 *)nlh)); + if (extack->cookie_len) + WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, + extack->cookie_len, extack->cookie)); } nlmsg_end(skb, rep); -- cgit v1.2.3 From 2363d73a2f3e92787f336721c40918ba2eb0c74c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 15 Mar 2020 18:17:53 +0100 Subject: ethtool: reject unrecognized request flags As pointed out by Jakub Kicinski, we ethtool netlink code should respond with an error if request head has flags set which are not recognized by kernel, either as a mistake or because it expects functionality introduced in later kernel versions. To avoid unnecessary roundtrips, use extack cookie to provide the information about supported request flags. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/netlink.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 180c194fab07..fc9e0b806889 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -40,6 +40,7 @@ int ethnl_parse_header(struct ethnl_req_info *req_info, struct nlattr *tb[ETHTOOL_A_HEADER_MAX + 1]; const struct nlattr *devname_attr; struct net_device *dev = NULL; + u32 flags = 0; int ret; if (!header) { @@ -50,8 +51,17 @@ int ethnl_parse_header(struct ethnl_req_info *req_info, ethnl_header_policy, extack); if (ret < 0) return ret; - devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME]; + if (tb[ETHTOOL_A_HEADER_FLAGS]) { + flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]); + if (flags & ~ETHTOOL_FLAG_ALL) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_HEADER_FLAGS], + "unrecognized request flags"); + nl_set_extack_cookie_u32(extack, ETHTOOL_FLAG_ALL); + return -EOPNOTSUPP; + } + } + devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME]; if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) { u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]); @@ -90,9 +100,7 @@ int ethnl_parse_header(struct ethnl_req_info *req_info, } req_info->dev = dev; - if (tb[ETHTOOL_A_HEADER_FLAGS]) - req_info->flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]); - + req_info->flags = flags; return 0; } -- cgit v1.2.3 From 2a9de3af21aa8c31cd68b0b39330d69f8c1e59df Mon Sep 17 00:00:00 2001 From: Torsten Hilbrich Date: Wed, 11 Mar 2020 11:19:06 +0100 Subject: vti6: Fix memory leak of skb if input policy check fails The vti6_rcv function performs some tests on the retrieved tunnel including checking the IP protocol, the XFRM input policy, the source and destination address. In all but one places the skb is released in the error case. When the input policy check fails the network packet is leaked. Using the same goto-label discard in this case to fix this problem. Fixes: ed1efb2aefbb ("ipv6: Add support for IPsec virtual tunnel interfaces") Signed-off-by: Torsten Hilbrich Reviewed-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- net/ipv6/ip6_vti.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 56e642efefff..cc6180e08a4f 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -311,7 +311,7 @@ static int vti6_rcv(struct sk_buff *skb) if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { rcu_read_unlock(); - return 0; + goto discard; } ipv6h = ipv6_hdr(skb); -- cgit v1.2.3 From 32ca98feab8c9076c89c0697c5a85e46fece809d Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 16 Mar 2020 19:53:00 +0200 Subject: net: ip_gre: Accept IFLA_INFO_DATA-less configuration The fix referenced below causes a crash when an ERSPAN tunnel is created without passing IFLA_INFO_DATA. Fix by validating passed-in data in the same way as ipgre does. Fixes: e1f8f78ffe98 ("net: ip_gre: Separate ERSPAN newlink / changelink callbacks") Reported-by: syzbot+1b4ebf4dae4e510dd219@syzkaller.appspotmail.com Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7765c65fc7d2..029b24eeafba 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1168,6 +1168,8 @@ static int erspan_netlink_parms(struct net_device *dev, err = ipgre_netlink_parms(dev, data, tb, parms, fwmark); if (err) return err; + if (!data) + return 0; if (data[IFLA_GRE_ERSPAN_VER]) { t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); -- cgit v1.2.3 From 8e7ae2518f5265f0ef09d561748098fde5a87ccd Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 13 Mar 2020 18:02:09 -0700 Subject: bpf: Sanitize the bpf_struct_ops tcp-cc name The bpf_struct_ops tcp-cc name should be sanitized in order to avoid problematic chars (e.g. whitespaces). This patch reuses the bpf_obj_name_cpy() for accepting the same set of characters in order to keep a consistent bpf programming experience. A "size" param is added. Also, the strlen is returned on success so that the caller (like the bpf_tcp_ca here) can error out on empty name. The existing callers of the bpf_obj_name_cpy() only need to change the testing statement to "if (err < 0)". For all these existing callers, the err will be overwritten later, so no extra change is needed for the new strlen return value. v3: - reverse xmas tree style v2: - Save the orig_src to avoid "end - size" (Andrii) Fixes: 0baf26b0fcd7 ("bpf: tcp: Support tcp_congestion_ops in bpf") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200314010209.1131542-1-kafai@fb.com --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 25 ++++++++++++++----------- net/ipv4/bpf_tcp_ca.c | 7 ++----- 3 files changed, 17 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 49b1a70e12c8..212991f6f2a5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -160,6 +160,7 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); +int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; struct bpf_offloaded_map; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0c7fb0d4836d..2857b7dda382 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -696,14 +696,15 @@ int bpf_get_file_flag(int flags) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. - * Return 0 on success and < 0 on error. +/* dst and src must have at least "size" number of bytes. + * Return strlen on success and < 0 on error. */ -static int bpf_obj_name_cpy(char *dst, const char *src) +int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) { - const char *end = src + BPF_OBJ_NAME_LEN; + const char *end = src + size; + const char *orig_src = src; - memset(dst, 0, BPF_OBJ_NAME_LEN); + memset(dst, 0, size); /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { if (!isalnum(*src) && @@ -712,11 +713,11 @@ static int bpf_obj_name_cpy(char *dst, const char *src) *dst++ = *src++; } - /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + /* No '\0' found in "size" number of bytes */ if (src == end) return -EINVAL; - return 0; + return src - orig_src; } int map_check_no_btf(const struct bpf_map *map, @@ -810,8 +811,9 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = bpf_obj_name_cpy(map->name, attr->map_name); - if (err) + err = bpf_obj_name_cpy(map->name, attr->map_name, + sizeof(attr->map_name)); + if (err < 0) goto free_map; atomic64_set(&map->refcnt, 1); @@ -2098,8 +2100,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); - err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); - if (err) + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, + sizeof(attr->prog_name)); + if (err < 0) goto free_prog; /* run eBPF verifier */ diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 574972bc7299..2bf3abeb1456 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -184,7 +184,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, { const struct tcp_congestion_ops *utcp_ca; struct tcp_congestion_ops *tcp_ca; - size_t tcp_ca_name_len; int prog_fd; u32 moff; @@ -199,13 +198,11 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, tcp_ca->flags = utcp_ca->flags; return 1; case offsetof(struct tcp_congestion_ops, name): - tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name)); - if (!tcp_ca_name_len || - tcp_ca_name_len == sizeof(utcp_ca->name)) + if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name, + sizeof(tcp_ca->name)) <= 0) return -EINVAL; if (tcp_ca_find(utcp_ca->name)) return -EEXIST; - memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name)); return 1; } -- cgit v1.2.3 From 2de9780f75076c1a1f122cbd39df0fa545284724 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 17 Mar 2020 15:54:20 +0100 Subject: net: core: dev.c: fix a documentation warning There's a markup for link with is "foo_". On this kernel-doc comment, we don't want this, but instead, place a literal reference. So, escape the literal with ``foo``, in order to avoid this warning: ./net/core/dev.c:5195: WARNING: Unknown target name: "page_is". Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c6c985fe7b1b..402a986659cf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5195,7 +5195,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) * * More direct receive version of netif_receive_skb(). It should * only be used by callers that have a need to skip RPS and Generic XDP. - * Caller must also take care of handling if (page_is_)pfmemalloc. + * Caller must also take care of handling if ``(page_is_)pfmemalloc``. * * This function may only be called from softirq context and interrupts * should be enabled. -- cgit v1.2.3 From dd2af10402684cb5840a127caec9e7cdcff6d167 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 18 Mar 2020 12:50:33 +0200 Subject: net/sched: act_ct: Fix leak of ct zone template on replace Currently, on replace, the previous action instance params is swapped with a newly allocated params. The old params is only freed (via kfree_rcu), without releasing the allocated ct zone template related to it. Call tcf_ct_params_free (via call_rcu) for the old params, so it will release it. Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct") Signed-off-by: Paul Blakey Signed-off-by: David S. Miller --- net/sched/act_ct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index f685c0d73708..41114b463161 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -739,7 +739,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); if (params) - kfree_rcu(params, rcu); + call_rcu(¶ms->rcu, tcf_ct_params_free); if (res == ACT_P_CREATED) tcf_idr_insert(tn, *a); -- cgit v1.2.3 From 61abaf02d2ec3d4575c66fa1b4a863877736b932 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Tue, 17 Mar 2020 10:02:52 +0800 Subject: netfilter: flowtable: reload ip{v6}h in nf_flow_nat_ip{v6} Since nf_flow_snat_port and nf_flow_snat_ip{v6} call pskb_may_pull() which may change skb->data, so we need to reload ip{v6}h at the right place. Fixes: a908fdec3dda ("netfilter: nf_flow_table: move ipv6 offload hook code to nf_flow_table") Fixes: 7d2086871762 ("netfilter: nf_flow_table: move ipv4 offload hook code to nf_flow_table") Signed-off-by: Haishuang Yan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 9e563fd3da0f..22caab7bb755 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -146,11 +146,13 @@ static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) + nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0)) return -1; + + iph = ip_hdr(skb); if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) + nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0)) return -1; return 0; @@ -426,11 +428,13 @@ static int nf_flow_nat_ipv6(const struct flow_offload *flow, if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + nf_flow_snat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0)) return -1; + + ip6h = ipv6_hdr(skb); if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + nf_flow_dnat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0)) return -1; return 0; -- cgit v1.2.3 From 41e9ec5a54f95eee1a57c8d26ab70e0492548c1b Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Tue, 17 Mar 2020 10:02:53 +0800 Subject: netfilter: flowtable: reload ip{v6}h in nf_flow_tuple_ip{v6} Since pskb_may_pull may change skb->data, so we need to reload ip{v6}h at the right place. Fixes: a908fdec3dda ("netfilter: nf_flow_table: move ipv6 offload hook code to nf_flow_table") Fixes: 7d2086871762 ("netfilter: nf_flow_table: move ipv4 offload hook code to nf_flow_table") Signed-off-by: Haishuang Yan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 22caab7bb755..ba775aecd89a 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -191,6 +191,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, if (!pskb_may_pull(skb, thoff + sizeof(*ports))) return -1; + iph = ip_hdr(skb); ports = (struct flow_ports *)(skb_network_header(skb) + thoff); tuple->src_v4.s_addr = iph->saddr; @@ -463,6 +464,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, if (!pskb_may_pull(skb, thoff + sizeof(*ports))) return -1; + ip6h = ipv6_hdr(skb); ports = (struct flow_ports *)(skb_network_header(skb) + thoff); tuple->src_v6 = ip6h->saddr; -- cgit v1.2.3 From c921ffe853332584eae4f5905cb2a14a7b3c9932 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Thu, 19 Mar 2020 11:52:25 +0200 Subject: netfilter: flowtable: Fix flushing of offloaded flows on free Freeing a flowtable with offloaded flows, the flow are deleted from hardware but are not deleted from the flow table, leaking them, and leaving their offload bit on. Add a second pass of the disabled gc to delete the these flows from the flow table before freeing it. Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Signed-off-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 8af28e10b4e6..70ebebaf5bc1 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -554,6 +554,9 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); nf_flow_table_offload_flush(flow_table); + if (nf_flowtable_hw_offload(flow_table)) + nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, + flow_table); rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); -- cgit v1.2.3 From 15ff197237e76c4dab06b7b518afaa4ebb1c43e0 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 19 Mar 2020 19:37:21 +0000 Subject: netfilter: flowtable: populate addr_type mask nf_flow_rule_match() sets control.addr_type in key, so needs to also set the corresponding mask. An exact match is wanted, so mask is all ones. Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Signed-off-by: Edward Cree Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 06f00cdc3891..f2c22c682851 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -87,6 +87,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, default: return -EOPNOTSUPP; } + mask->control.addr_type = 0xffff; match->dissector.used_keys |= BIT(key->control.addr_type); mask->basic.n_proto = 0xffff; -- cgit v1.2.3 From b738a185beaab8728943acdb3e67371b8a88185e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Mar 2020 12:49:55 -0700 Subject: tcp: ensure skb->dev is NULL before leaving TCP stack skb->rbnode is sharing three skb fields : next, prev, dev When a packet is sent, TCP keeps the original skb (master) in a rtx queue, which was converted to rbtree a while back. __tcp_transmit_skb() is responsible to clone the master skb, and add the TCP header to the clone before sending it to network layer. skb_clone() already clears skb->next and skb->prev, but copies the master oskb->dev into the clone. We need to clear skb->dev, otherwise lower layers could interpret the value as a pointer to a netdev. This old bug surfaced recently when commit 28f8bfd1ac94 ("netfilter: Support iif matches in POSTROUTING") was merged. Before this netfilter commit, skb->dev value was ignored and changed before reaching dev_queue_xmit() Fixes: 75c119afe14f ("tcp: implement rb-tree based retransmit queue") Fixes: 28f8bfd1ac94 ("netfilter: Support iif matches in POSTROUTING") Signed-off-by: Eric Dumazet Reported-by: Martin Zaharinov Cc: Florian Westphal Cc: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 306e25d743e8..e8cf8fde3d37 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1109,6 +1109,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (unlikely(!skb)) return -ENOBUFS; + /* retransmit skbs might have a non zero value in skb->dev + * because skb->dev is aliased with skb->rbnode.rb_left + */ + skb->dev = NULL; } inet = inet_sk(sk); -- cgit v1.2.3 From 09952e3e7826119ddd4357c453d54bcc7ef25156 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Mar 2020 20:16:56 -0600 Subject: io_uring: make sure accept honor rlimit nofile Just like commit 4022e7af86be, this fixes the fact that IORING_OP_ACCEPT ends up using get_unused_fd_flags(), which checks current->signal->rlim[] for limits. Add an extra argument to __sys_accept4_file() that allows us to pass in the proper nofile limit, and grab it at request prep time. Acked-by: David S. Miller Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++++- include/linux/socket.h | 3 ++- net/socket.c | 8 +++++--- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/fs/io_uring.c b/fs/io_uring.c index fe5ded7c74ef..3affd96a98ba 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -343,6 +343,7 @@ struct io_accept { struct sockaddr __user *addr; int __user *addr_len; int flags; + unsigned long nofile; }; struct io_sync { @@ -3324,6 +3325,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); + accept->nofile = rlimit(RLIMIT_NOFILE); return 0; #else return -EOPNOTSUPP; @@ -3340,7 +3342,8 @@ static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, file_flags = force_nonblock ? O_NONBLOCK : 0; ret = __sys_accept4_file(req->file, file_flags, accept->addr, - accept->addr_len, accept->flags); + accept->addr_len, accept->flags, + accept->nofile); if (ret == -EAGAIN && force_nonblock) return -EAGAIN; if (ret == -ERESTARTSYS) diff --git a/include/linux/socket.h b/include/linux/socket.h index 2d2313403101..15f3412d481e 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -401,7 +401,8 @@ extern int __sys_sendto(int fd, void __user *buff, size_t len, int addr_len); extern int __sys_accept4_file(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags); + int __user *upeer_addrlen, int flags, + unsigned long nofile); extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); diff --git a/net/socket.c b/net/socket.c index b79a05de7c6e..2eecf1517f76 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1707,7 +1707,8 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog) int __sys_accept4_file(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags) + int __user *upeer_addrlen, int flags, + unsigned long nofile) { struct socket *sock, *newsock; struct file *newfile; @@ -1738,7 +1739,7 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, */ __module_get(newsock->ops->owner); - newfd = get_unused_fd_flags(flags); + newfd = __get_unused_fd_flags(flags, nofile); if (unlikely(newfd < 0)) { err = newfd; sock_release(newsock); @@ -1807,7 +1808,8 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, f = fdget(fd); if (f.file) { ret = __sys_accept4_file(f.file, 0, upeer_sockaddr, - upeer_addrlen, flags); + upeer_addrlen, flags, + rlimit(RLIMIT_NOFILE)); if (f.flags) fput(f.file); } -- cgit v1.2.3 From 07f8e4d0fddbf2f87e4cefb551278abc38db8cdd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 20 Mar 2020 16:52:02 +0100 Subject: tcp: also NULL skb->dev when copy was needed In rare cases retransmit logic will make a full skb copy, which will not trigger the zeroing added in recent change b738a185beaa ("tcp: ensure skb->dev is NULL before leaving TCP stack"). Cc: Eric Dumazet Fixes: 75c119afe14f ("tcp: implement rb-tree based retransmit queue") Fixes: 28f8bfd1ac94 ("netfilter: Support iif matches in POSTROUTING") Signed-off-by: Florian Westphal Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e8cf8fde3d37..2f45cde168c4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3041,8 +3041,12 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) tcp_skb_tsorted_save(skb) { nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + if (nskb) { + nskb->dev = NULL; + err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC); + } else { + err = -ENOBUFS; + } } tcp_skb_tsorted_restore(skb); if (!err) { -- cgit v1.2.3 From 3a303cfdd28d5f930a307c82e8a9d996394d5ebd Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sat, 21 Mar 2020 06:46:50 +0000 Subject: hsr: fix general protection fault in hsr_addr_is_self() The port->hsr is used in the hsr_handle_frame(), which is a callback of rx_handler. hsr master and slaves are initialized in hsr_add_port(). This function initializes several pointers, which includes port->hsr after registering rx_handler. So, in the rx_handler routine, un-initialized pointer would be used. In order to fix this, pointers should be initialized before registering rx_handler. Test commands: ip netns del left ip netns del right modprobe -rv veth modprobe -rv hsr killall ping modprobe hsr ip netns add left ip netns add right ip link add veth0 type veth peer name veth1 ip link add veth2 type veth peer name veth3 ip link add veth4 type veth peer name veth5 ip link set veth1 netns left ip link set veth3 netns right ip link set veth4 netns left ip link set veth5 netns right ip link set veth0 up ip link set veth2 up ip link set veth0 address fc:00:00:00:00:01 ip link set veth2 address fc:00:00:00:00:02 ip netns exec left ip link set veth1 up ip netns exec left ip link set veth4 up ip netns exec right ip link set veth3 up ip netns exec right ip link set veth5 up ip link add hsr0 type hsr slave1 veth0 slave2 veth2 ip a a 192.168.100.1/24 dev hsr0 ip link set hsr0 up ip netns exec left ip link add hsr1 type hsr slave1 veth1 slave2 veth4 ip netns exec left ip a a 192.168.100.2/24 dev hsr1 ip netns exec left ip link set hsr1 up ip netns exec left ip n a 192.168.100.1 dev hsr1 lladdr \ fc:00:00:00:00:01 nud permanent ip netns exec left ip n r 192.168.100.1 dev hsr1 lladdr \ fc:00:00:00:00:01 nud permanent for i in {1..100} do ip netns exec left ping 192.168.100.1 & done ip netns exec left hping3 192.168.100.1 -2 --flood & ip netns exec right ip link add hsr2 type hsr slave1 veth3 slave2 veth5 ip netns exec right ip a a 192.168.100.3/24 dev hsr2 ip netns exec right ip link set hsr2 up ip netns exec right ip n a 192.168.100.1 dev hsr2 lladdr \ fc:00:00:00:00:02 nud permanent ip netns exec right ip n r 192.168.100.1 dev hsr2 lladdr \ fc:00:00:00:00:02 nud permanent for i in {1..100} do ip netns exec right ping 192.168.100.1 & done ip netns exec right hping3 192.168.100.1 -2 --flood & while : do ip link add hsr0 type hsr slave1 veth0 slave2 veth2 ip a a 192.168.100.1/24 dev hsr0 ip link set hsr0 up ip link del hsr0 done Splat looks like: [ 120.954938][ C0] general protection fault, probably for non-canonical address 0xdffffc0000000006: 0000 [#1]I [ 120.957761][ C0] KASAN: null-ptr-deref in range [0x0000000000000030-0x0000000000000037] [ 120.959064][ C0] CPU: 0 PID: 1511 Comm: hping3 Not tainted 5.6.0-rc5+ #460 [ 120.960054][ C0] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 120.962261][ C0] RIP: 0010:hsr_addr_is_self+0x65/0x2a0 [hsr] [ 120.963149][ C0] Code: 44 24 18 70 73 2f c0 48 c1 eb 03 48 8d 04 13 c7 00 f1 f1 f1 f1 c7 40 04 00 f2 f2 f2 4 [ 120.966277][ C0] RSP: 0018:ffff8880d9c09af0 EFLAGS: 00010206 [ 120.967293][ C0] RAX: 0000000000000006 RBX: 1ffff1101b38135f RCX: 0000000000000000 [ 120.968516][ C0] RDX: dffffc0000000000 RSI: ffff8880d17cb208 RDI: 0000000000000000 [ 120.969718][ C0] RBP: 0000000000000030 R08: ffffed101b3c0e3c R09: 0000000000000001 [ 120.972203][ C0] R10: 0000000000000001 R11: ffffed101b3c0e3b R12: 0000000000000000 [ 120.973379][ C0] R13: ffff8880aaf80100 R14: ffff8880aaf800f2 R15: ffff8880aaf80040 [ 120.974410][ C0] FS: 00007f58e693f740(0000) GS:ffff8880d9c00000(0000) knlGS:0000000000000000 [ 120.979794][ C0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 120.980773][ C0] CR2: 00007ffcb8b38f29 CR3: 00000000afe8e001 CR4: 00000000000606f0 [ 120.981945][ C0] Call Trace: [ 120.982411][ C0] [ 120.982848][ C0] ? hsr_add_node+0x8c0/0x8c0 [hsr] [ 120.983522][ C0] ? rcu_read_lock_held+0x90/0xa0 [ 120.984159][ C0] ? rcu_read_lock_sched_held+0xc0/0xc0 [ 120.984944][ C0] hsr_handle_frame+0x1db/0x4e0 [hsr] [ 120.985597][ C0] ? hsr_nl_nodedown+0x2b0/0x2b0 [hsr] [ 120.986289][ C0] __netif_receive_skb_core+0x6bf/0x3170 [ 120.992513][ C0] ? check_chain_key+0x236/0x5d0 [ 120.993223][ C0] ? do_xdp_generic+0x1460/0x1460 [ 120.993875][ C0] ? register_lock_class+0x14d0/0x14d0 [ 120.994609][ C0] ? __netif_receive_skb_one_core+0x8d/0x160 [ 120.995377][ C0] __netif_receive_skb_one_core+0x8d/0x160 [ 120.996204][ C0] ? __netif_receive_skb_core+0x3170/0x3170 [ ... ] Reported-by: syzbot+fcf5dd39282ceb27108d@syzkaller.appspotmail.com Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_slave.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index fbfd0db182b7..a9104d42aafb 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -145,16 +145,16 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, if (!port) return -ENOMEM; + port->hsr = hsr; + port->dev = dev; + port->type = type; + if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(dev, port); if (res) goto fail_dev_setup; } - port->hsr = hsr; - port->dev = dev; - port->type = type; - list_add_tail_rcu(&port->port_list, &hsr->ports); synchronize_rcu(); -- cgit v1.2.3 From 7614209736fbc4927584d4387faade4f31444fce Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 9 Mar 2020 12:03:14 +0100 Subject: ceph: check POOL_FLAG_FULL/NEARFULL in addition to OSDMAP_FULL/NEARFULL CEPH_OSDMAP_FULL/NEARFULL aren't set since mimic, so we need to consult per-pool flags as well. Unfortunately the backwards compatibility here is lacking: - the change that deprecated OSDMAP_FULL/NEARFULL went into mimic, but was guarded by require_osd_release >= RELEASE_LUMINOUS - it was subsequently backported to luminous in v12.2.2, but that makes no difference to clients that only check OSDMAP_FULL/NEARFULL because require_osd_release is not client-facing -- it is for OSDs Since all kernels are affected, the best we can do here is just start checking both map flags and pool flags and send that to stable. These checks are best effort, so take osdc->lock and look up pool flags just once. Remove the FIXME, since filesystem quotas are checked above and RADOS quotas are reflected in POOL_FLAG_FULL: when the pool reaches its quota, both POOL_FLAG_FULL and POOL_FLAG_FULL_QUOTA are set. Cc: stable@vger.kernel.org Reported-by: Yanhu Cao Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton Acked-by: Sage Weil --- fs/ceph/file.c | 14 +++++++++++--- include/linux/ceph/osdmap.h | 4 ++++ include/linux/ceph/rados.h | 6 ++++-- net/ceph/osdmap.c | 9 +++++++++ 4 files changed, 28 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7e0190b1f821..5a478cd06e11 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1415,10 +1415,13 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; int err, want, got; bool direct_lock = false; + u32 map_flags; + u64 pool_flags; loff_t pos; loff_t limit = max(i_size_read(inode), fsc->max_file_size); @@ -1481,8 +1484,12 @@ retry_snap: goto out; } - /* FIXME: not complete since it doesn't account for being at quota */ - if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL)) { + down_read(&osdc->lock); + map_flags = osdc->osdmap->flags; + pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id); + up_read(&osdc->lock); + if ((map_flags & CEPH_OSDMAP_FULL) || + (pool_flags & CEPH_POOL_FLAG_FULL)) { err = -ENOSPC; goto out; } @@ -1575,7 +1582,8 @@ retry_snap: } if (written >= 0) { - if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_NEARFULL)) + if ((map_flags & CEPH_OSDMAP_NEARFULL) || + (pool_flags & CEPH_POOL_FLAG_NEARFULL)) iocb->ki_flags |= IOCB_DSYNC; written = generic_write_sync(iocb, written); } diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index e081b56f1c1d..5e601975745f 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -37,6 +37,9 @@ int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs); #define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id together */ #define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */ +#define CEPH_POOL_FLAG_FULL_QUOTA (1ULL << 10) /* pool ran out of quota, + will set FULL too */ +#define CEPH_POOL_FLAG_NEARFULL (1ULL << 11) /* pool is nearfull */ struct ceph_pg_pool_info { struct rb_node node; @@ -304,5 +307,6 @@ extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); +u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id); #endif diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 59bdfd470100..88ed3c5c04c5 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -143,8 +143,10 @@ extern const char *ceph_osd_state_name(int s); /* * osd map flag bits */ -#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ -#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ +#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC), + not set since ~luminous */ +#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC), + not set since ~luminous */ #define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ #define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ #define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4e0de14f80bb..2a6e63a8edbe 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -710,6 +710,15 @@ int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) } EXPORT_SYMBOL(ceph_pg_poolid_by_name); +u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id) +{ + struct ceph_pg_pool_info *pi; + + pi = __lookup_pg_pool(&map->pg_pools, id); + return pi ? pi->flags : 0; +} +EXPORT_SYMBOL(ceph_pg_pool_flags); + static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) { rb_erase(&pi->node, root); -- cgit v1.2.3 From e886274031200bb60965c1b9c49b7acda56a93bd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 10 Mar 2020 16:19:01 +0100 Subject: libceph: fix alloc_msg_with_page_vector() memory leaks Make it so that CEPH_MSG_DATA_PAGES data item can own pages, fixing a bunch of memory leaks for a page vector allocated in alloc_msg_with_page_vector(). Currently, only watch-notify messages trigger this allocation, and normally the page vector is freed either in handle_watch_notify() or by the caller of ceph_osdc_notify(). But if the message is freed before that (e.g. if the session faults while reading in the message or if the notify is stale), we leak the page vector. This was supposed to be fixed by switching to a message-owned pagelist, but that never happened. Fixes: 1907920324f1 ("libceph: support for sending notifies") Reported-by: Roman Penyaev Signed-off-by: Ilya Dryomov Reviewed-by: Roman Penyaev --- include/linux/ceph/messenger.h | 7 ++++--- net/ceph/messenger.c | 9 +++++++-- net/ceph/osd_client.c | 14 +++----------- 3 files changed, 14 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index c4458dc6a757..76371aaae2d1 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -175,9 +175,10 @@ struct ceph_msg_data { #endif /* CONFIG_BLOCK */ struct ceph_bvec_iter bvec_pos; struct { - struct page **pages; /* NOT OWNER. */ + struct page **pages; size_t length; /* total # bytes */ unsigned int alignment; /* first page */ + bool own_pages; }; struct ceph_pagelist *pagelist; }; @@ -356,8 +357,8 @@ extern void ceph_con_keepalive(struct ceph_connection *con); extern bool ceph_con_keepalive_expired(struct ceph_connection *con, unsigned long interval); -extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, - size_t length, size_t alignment); +void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, + size_t length, size_t alignment, bool own_pages); extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist); #ifdef CONFIG_BLOCK diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 5b4bd8261002..f8ca5edc5f2c 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3248,12 +3248,16 @@ static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg) static void ceph_msg_data_destroy(struct ceph_msg_data *data) { - if (data->type == CEPH_MSG_DATA_PAGELIST) + if (data->type == CEPH_MSG_DATA_PAGES && data->own_pages) { + int num_pages = calc_pages_for(data->alignment, data->length); + ceph_release_page_vector(data->pages, num_pages); + } else if (data->type == CEPH_MSG_DATA_PAGELIST) { ceph_pagelist_release(data->pagelist); + } } void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, - size_t length, size_t alignment) + size_t length, size_t alignment, bool own_pages) { struct ceph_msg_data *data; @@ -3265,6 +3269,7 @@ void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, data->pages = pages; data->length = length; data->alignment = alignment & ~PAGE_MASK; + data->own_pages = own_pages; msg->data_length += length; } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b68b376d8c2f..af868d3923b9 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -962,7 +962,7 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, BUG_ON(length > (u64) SIZE_MAX); if (length) ceph_msg_data_add_pages(msg, osd_data->pages, - length, osd_data->alignment); + length, osd_data->alignment, false); } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { BUG_ON(!length); ceph_msg_data_add_pagelist(msg, osd_data->pagelist); @@ -4436,9 +4436,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc, CEPH_MSG_DATA_PAGES); *lreq->preply_pages = data->pages; *lreq->preply_len = data->length; - } else { - ceph_release_page_vector(data->pages, - calc_pages_for(0, data->length)); + data->own_pages = false; } } lreq->notify_finish_error = return_code; @@ -5506,9 +5504,6 @@ out_unlock_osdc: return m; } -/* - * TODO: switch to a msg-owned pagelist - */ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) { struct ceph_msg *m; @@ -5522,7 +5517,6 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) if (data_len) { struct page **pages; - struct ceph_osd_data osd_data; pages = ceph_alloc_page_vector(calc_pages_for(0, data_len), GFP_NOIO); @@ -5531,9 +5525,7 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) return NULL; } - ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false, - false); - ceph_osdc_msg_data_add(m, &osd_data); + ceph_msg_data_add_pages(m, pages, data_len, 0, true); } return m; -- cgit v1.2.3 From 6cd6cbf593bfa3ae6fc3ed34ac21da4d35045425 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Mar 2020 19:21:02 -0700 Subject: tcp: repair: fix TCP_QUEUE_SEQ implementation When application uses TCP_QUEUE_SEQ socket option to change tp->rcv_next, we must also update tp->copied_seq. Otherwise, stuff relying on tcp_inq() being precise can eventually be confused. For example, tcp_zerocopy_receive() might crash because it does not expect tcp_recv_skb() to return NULL. We could add tests in various places to fix the issue, or simply make sure tcp_inq() wont return a random value, and leave fast path as it is. Note that this fixes ioctl(fd, SIOCINQ, &val) at the same time. Fixes: ee9952831cfd ("tcp: Initial repair mode") Fixes: 05255b823a61 ("tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index eb2d80519f8e..dc77c303e6f7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2948,8 +2948,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EPERM; else if (tp->repair_queue == TCP_SEND_QUEUE) WRITE_ONCE(tp->write_seq, val); - else if (tp->repair_queue == TCP_RECV_QUEUE) + else if (tp->repair_queue == TCP_RECV_QUEUE) { WRITE_ONCE(tp->rcv_nxt, val); + WRITE_ONCE(tp->copied_seq, val); + } else err = -EINVAL; break; -- cgit v1.2.3 From dddeb30bfc43926620f954266fd12c65a7206f07 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 19 Mar 2020 22:54:21 -0400 Subject: ipv4: fix a RCU-list lock in inet_dump_fib() There is a place, inet_dump_fib() fib_table_dump fn_trie_dump_leaf() hlist_for_each_entry_rcu() without rcu_read_lock() will trigger a warning, WARNING: suspicious RCU usage ----------------------------- net/ipv4/fib_trie.c:2216 RCU-list traversed in non-reader section!! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by ip/1923: #0: ffffffff8ce76e40 (rtnl_mutex){+.+.}, at: netlink_dump+0xd6/0x840 Call Trace: dump_stack+0xa1/0xea lockdep_rcu_suspicious+0x103/0x10d fn_trie_dump_leaf+0x581/0x590 fib_table_dump+0x15f/0x220 inet_dump_fib+0x4ad/0x5d0 netlink_dump+0x350/0x840 __netlink_dump_start+0x315/0x3e0 rtnetlink_rcv_msg+0x4d1/0x720 netlink_rcv_skb+0xf0/0x220 rtnetlink_rcv+0x15/0x20 netlink_unicast+0x306/0x460 netlink_sendmsg+0x44b/0x770 __sys_sendto+0x259/0x270 __x64_sys_sendto+0x80/0xa0 do_syscall_64+0x69/0xf4 entry_SYSCALL_64_after_hwframe+0x49/0xb3 Fixes: 18a8021a7be3 ("net/ipv4: Plumb support for filtering route dumps") Signed-off-by: Qian Cai Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 577db1d50a24..213be9c050ad 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -997,7 +997,9 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) return -ENOENT; } + rcu_read_lock(); err = fib_table_dump(tb, skb, cb, &filter); + rcu_read_unlock(); return skb->len ? : err; } -- cgit v1.2.3 From 0e62f543bed03a64495bd2651d4fe1aa4bcb7fe5 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sun, 22 Mar 2020 13:58:50 -0700 Subject: net: dsa: Fix duplicate frames flooded by learning When both the switch and the bridge are learning about new addresses, switch ports attached to the bridge would see duplicate ARP frames because both entities would attempt to send them. Fixes: 5037d532b83d ("net: dsa: add Broadcom tag RX/TX handler") Reported-by: Maxime Bizon Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/tag_brcm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 9c3114179690..9169b63a89e3 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -140,6 +140,8 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); + skb->offload_fwd_mark = 1; + return skb; } #endif -- cgit v1.2.3 From 2f599ec422ad6634fb5ad43748b9969ca9d742bd Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 22 Mar 2020 22:24:21 +0100 Subject: ethtool: fix reference leak in some *_SET handlers Andrew noticed that some handlers for *_SET commands leak a netdev reference if required ethtool_ops callbacks do not exist. A simple reproducer would be e.g. ip link add veth1 type veth peer name veth2 ethtool -s veth1 wol g ip link del veth1 Make sure dev_put() is called when ethtool_ops check fails. v2: add Fixes tags Fixes: a53f3d41e4d3 ("ethtool: set link settings with LINKINFO_SET request") Fixes: bfbcfe2032e7 ("ethtool: set link modes related data with LINKMODES_SET request") Fixes: e54d04e3afea ("ethtool: set message mask with DEBUG_SET request") Fixes: 8d425b19b305 ("ethtool: set wake-on-lan settings with WOL_SET request") Reported-by: Andrew Lunn Signed-off-by: Michal Kubecek Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/ethtool/debug.c | 4 +++- net/ethtool/linkinfo.c | 4 +++- net/ethtool/linkmodes.c | 4 +++- net/ethtool/wol.c | 4 +++- 4 files changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c index aaef4843e6ba..92599ad7b3c2 100644 --- a/net/ethtool/debug.c +++ b/net/ethtool/debug.c @@ -107,8 +107,9 @@ int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; dev = req_info.dev; + ret = -EOPNOTSUPP; if (!dev->ethtool_ops->get_msglevel || !dev->ethtool_ops->set_msglevel) - return -EOPNOTSUPP; + goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); @@ -129,6 +130,7 @@ out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); +out_dev: dev_put(dev); return ret; } diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index 5d16cb4e8693..6e9e0b590bb5 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -126,9 +126,10 @@ int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; dev = req_info.dev; + ret = -EOPNOTSUPP; if (!dev->ethtool_ops->get_link_ksettings || !dev->ethtool_ops->set_link_ksettings) - return -EOPNOTSUPP; + goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); @@ -162,6 +163,7 @@ out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); +out_dev: dev_put(dev); return ret; } diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 96f20be64553..18cc37be2d9c 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -338,9 +338,10 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; dev = req_info.dev; + ret = -EOPNOTSUPP; if (!dev->ethtool_ops->get_link_ksettings || !dev->ethtool_ops->set_link_ksettings) - return -EOPNOTSUPP; + goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); @@ -370,6 +371,7 @@ out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); +out_dev: dev_put(dev); return ret; } diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c index e1b8a65b64c4..55e1ecaaf739 100644 --- a/net/ethtool/wol.c +++ b/net/ethtool/wol.c @@ -128,8 +128,9 @@ int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; dev = req_info.dev; + ret = -EOPNOTSUPP; if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol) - return -EOPNOTSUPP; + goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); @@ -172,6 +173,7 @@ out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); +out_dev: dev_put(dev); return ret; } -- cgit v1.2.3 From 4c59406ed00379c8663f8663d82b2537467ce9d7 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 23 Mar 2020 15:32:39 +0800 Subject: xfrm: policy: Fix doulbe free in xfrm_policy_timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After xfrm_add_policy add a policy, its ref is 2, then xfrm_policy_timer read_lock xp->walk.dead is 0 .... mod_timer() xfrm_policy_kill policy->walk.dead = 1 .... del_timer(&policy->timer) xfrm_pol_put //ref is 1 xfrm_pol_put //ref is 0 xfrm_policy_destroy call_rcu xfrm_pol_hold //ref is 1 read_unlock xfrm_pol_put //ref is 0 xfrm_policy_destroy call_rcu xfrm_policy_destroy is called twice, which may leads to double free. Call Trace: RIP: 0010:refcount_warn_saturate+0x161/0x210 ... xfrm_policy_timer+0x522/0x600 call_timer_fn+0x1b3/0x5e0 ? __xfrm_decode_session+0x2990/0x2990 ? msleep+0xb0/0xb0 ? _raw_spin_unlock_irq+0x24/0x40 ? __xfrm_decode_session+0x2990/0x2990 ? __xfrm_decode_session+0x2990/0x2990 run_timer_softirq+0x5c5/0x10e0 Fix this by use write_lock_bh in xfrm_policy_kill. Fixes: ea2dea9dacc2 ("xfrm: remove policy lock when accessing policy->walk.dead") Signed-off-by: YueHaibing Acked-by: Timo Teräs Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 297d1eb79e5c..96a8c452b63e 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -434,7 +434,9 @@ EXPORT_SYMBOL(xfrm_policy_destroy); static void xfrm_policy_kill(struct xfrm_policy *policy) { + write_lock_bh(&policy->lock); policy->walk.dead = 1; + write_unlock_bh(&policy->lock); atomic_inc(&policy->genid); -- cgit v1.2.3 From 8c2d45b2b65ca1f215244be1c600236e83f9815f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 22 Mar 2020 03:21:58 +0100 Subject: netfilter: nf_tables: Allow set back-ends to report partial overlaps on insertion Currently, the -EEXIST return code of ->insert() callbacks is ambiguous: it might indicate that a given element (including intervals) already exists as such, or that the new element would clash with existing ones. If identical elements already exist, the front-end is ignoring this without returning error, in case NLM_F_EXCL is not set. However, if the new element can't be inserted due an overlap, we should report this to the user. To this purpose, allow set back-ends to return -ENOTEMPTY on collision with existing elements, translate that to -EEXIST, and return that to userspace, no matter if NLM_F_EXCL was set. Reported-by: Phil Sutter Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 38c680f28f15..d11f1a74d43c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5082,6 +5082,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -EBUSY; else if (!(nlmsg_flags & NLM_F_EXCL)) err = 0; + } else if (err == -ENOTEMPTY) { + /* ENOTEMPTY reports overlapping between this element + * and an existing one. + */ + err = -EEXIST; } goto err_element_clash; } -- cgit v1.2.3 From 0eb4b5ee33f2461d149408a247af8ae24756a6ca Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Sun, 22 Mar 2020 03:21:59 +0100 Subject: netfilter: nft_set_pipapo: Separate partial and complete overlap cases on insertion ...and return -ENOTEMPTY to the front-end on collision, -EEXIST if an identical element already exists. Together with the previous patch, element collision will now be returned to the user as -EEXIST. Reported-by: Phil Sutter Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 4fc0c924ed5d..ef7e8ad2e344 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1098,21 +1098,41 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, struct nft_pipapo_field *f; int i, bsize_max, err = 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) + end = (const u8 *)nft_set_ext_key_end(ext)->data; + else + end = start; + dup = pipapo_get(net, set, start, genmask); - if (PTR_ERR(dup) == -ENOENT) { - if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) { - end = (const u8 *)nft_set_ext_key_end(ext)->data; - dup = pipapo_get(net, set, end, nft_genmask_next(net)); - } else { - end = start; + if (!IS_ERR(dup)) { + /* Check if we already have the same exact entry */ + const struct nft_data *dup_key, *dup_end; + + dup_key = nft_set_ext_key(&dup->ext); + if (nft_set_ext_exists(&dup->ext, NFT_SET_EXT_KEY_END)) + dup_end = nft_set_ext_key_end(&dup->ext); + else + dup_end = dup_key; + + if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) && + !memcmp(end, dup_end->data, sizeof(*dup_end->data))) { + *ext2 = &dup->ext; + return -EEXIST; } + + return -ENOTEMPTY; + } + + if (PTR_ERR(dup) == -ENOENT) { + /* Look for partially overlapping entries */ + dup = pipapo_get(net, set, end, nft_genmask_next(net)); } if (PTR_ERR(dup) != -ENOENT) { if (IS_ERR(dup)) return PTR_ERR(dup); *ext2 = &dup->ext; - return -EEXIST; + return -ENOTEMPTY; } /* Validate */ -- cgit v1.2.3 From 6f7c9caf017be8ab0fe3b99509580d0793bf0833 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Sun, 22 Mar 2020 03:22:00 +0100 Subject: netfilter: nft_set_rbtree: Introduce and use nft_rbtree_interval_start() Replace negations of nft_rbtree_interval_end() with a new helper, nft_rbtree_interval_start(), wherever this helps to visualise the problem at hand, that is, for all the occurrences except for the comparison against given flags in __nft_rbtree_get(). This gets especially useful in the next patch. Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 5000b938ab1e..85572b2a6051 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -33,6 +33,11 @@ static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END); } +static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) +{ + return !nft_rbtree_interval_end(rbe); +} + static bool nft_rbtree_equal(const struct nft_set *set, const void *this, const struct nft_rbtree_elem *interval) { @@ -64,7 +69,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (interval && nft_rbtree_equal(set, this, interval) && nft_rbtree_interval_end(rbe) && - !nft_rbtree_interval_end(interval)) + nft_rbtree_interval_start(interval)) continue; interval = rbe; } else if (d > 0) @@ -89,7 +94,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && - !nft_rbtree_interval_end(interval)) { + nft_rbtree_interval_start(interval)) { *ext = &interval->ext; return true; } @@ -224,9 +229,9 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, p = &parent->rb_right; else { if (nft_rbtree_interval_end(rbe) && - !nft_rbtree_interval_end(new)) { + nft_rbtree_interval_start(new)) { p = &parent->rb_left; - } else if (!nft_rbtree_interval_end(rbe) && + } else if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_end(new)) { p = &parent->rb_right; } else if (nft_set_elem_active(&rbe->ext, genmask)) { @@ -317,10 +322,10 @@ static void *nft_rbtree_deactivate(const struct net *net, parent = parent->rb_right; else { if (nft_rbtree_interval_end(rbe) && - !nft_rbtree_interval_end(this)) { + nft_rbtree_interval_start(this)) { parent = parent->rb_left; continue; - } else if (!nft_rbtree_interval_end(rbe) && + } else if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_end(this)) { parent = parent->rb_right; continue; -- cgit v1.2.3 From 7c84d41416d836ef7e533bd4d64ccbdf40c5ac70 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Sun, 22 Mar 2020 03:22:01 +0100 Subject: netfilter: nft_set_rbtree: Detect partial overlaps on insertion ...and return -ENOTEMPTY to the front-end in this case, instead of proceeding. Currently, nft takes care of checking for these cases and not sending them to the kernel, but if we drop the set_overlap() call in nft we can end up in situations like: # nft add table t # nft add set t s '{ type inet_service ; flags interval ; }' # nft add element t s '{ 1 - 5 }' # nft add element t s '{ 6 - 10 }' # nft add element t s '{ 4 - 7 }' # nft list set t s table ip t { set s { type inet_service flags interval elements = { 1-3, 4-5, 6-7 } } } This change has the primary purpose of making the behaviour consistent with nft_set_pipapo, but is also functional to avoid inconsistent behaviour if userspace sends overlapping elements for any reason. v2: When we meet the same key data in the tree, as start element while inserting an end element, or as end element while inserting a start element, actually check that the existing element is active, before resetting the overlap flag (Pablo Neira Ayuso) Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 70 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 85572b2a6051..8617fc16a1ed 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -213,8 +213,43 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, u8 genmask = nft_genmask_next(net); struct nft_rbtree_elem *rbe; struct rb_node *parent, **p; + bool overlap = false; int d; + /* Detect overlaps as we descend the tree. Set the flag in these cases: + * + * a1. |__ _ _? >|__ _ _ (insert start after existing start) + * a2. _ _ __>| ?_ _ __| (insert end before existing end) + * a3. _ _ ___| ?_ _ _>| (insert end after existing end) + * a4. >|__ _ _ _ _ __| (insert start before existing end) + * + * and clear it later on, as we eventually reach the points indicated by + * '?' above, in the cases described below. We'll always meet these + * later, locally, due to tree ordering, and overlaps for the intervals + * that are the closest together are always evaluated last. + * + * b1. |__ _ _! >|__ _ _ (insert start after existing end) + * b2. _ _ __>| !_ _ __| (insert end before existing start) + * b3. !_____>| (insert end after existing start) + * + * Case a4. resolves to b1.: + * - if the inserted start element is the leftmost, because the '0' + * element in the tree serves as end element + * - otherwise, if an existing end is found. Note that end elements are + * always inserted after corresponding start elements. + * + * For a new, rightmost pair of elements, we'll hit cases b1. and b3., + * in that order. + * + * The flag is also cleared in two special cases: + * + * b4. |__ _ _!|<_ _ _ (insert start right before existing end) + * b5. |__ _ >|!__ _ _ (insert end right after existing start) + * + * which always happen as last step and imply that no further + * overlapping is possible. + */ + parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { @@ -223,17 +258,42 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, d = memcmp(nft_set_ext_key(&rbe->ext), nft_set_ext_key(&new->ext), set->klen); - if (d < 0) + if (d < 0) { p = &parent->rb_left; - else if (d > 0) + + if (nft_rbtree_interval_start(new)) { + overlap = nft_rbtree_interval_start(rbe) && + nft_set_elem_active(&rbe->ext, + genmask); + } else { + overlap = nft_rbtree_interval_end(rbe) && + nft_set_elem_active(&rbe->ext, + genmask); + } + } else if (d > 0) { p = &parent->rb_right; - else { + + if (nft_rbtree_interval_end(new)) { + overlap = nft_rbtree_interval_end(rbe) && + nft_set_elem_active(&rbe->ext, + genmask); + } else if (nft_rbtree_interval_end(rbe) && + nft_set_elem_active(&rbe->ext, genmask)) { + overlap = true; + } + } else { if (nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(new)) { p = &parent->rb_left; + + if (nft_set_elem_active(&rbe->ext, genmask)) + overlap = false; } else if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_end(new)) { p = &parent->rb_right; + + if (nft_set_elem_active(&rbe->ext, genmask)) + overlap = false; } else if (nft_set_elem_active(&rbe->ext, genmask)) { *ext = &rbe->ext; return -EEXIST; @@ -242,6 +302,10 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, } } } + + if (overlap) + return -ENOTEMPTY; + rb_link_node_rcu(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); return 0; -- cgit v1.2.3 From 76a109fac206e158eb3c967af98c178cff738e6a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 23 Mar 2020 14:27:16 +0100 Subject: netfilter: nft_fwd_netdev: validate family and chain type Make sure the forward action is only used from ingress. Fixes: 39e6dea28adc ("netfilter: nf_tables: add forward expression to the netdev family") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_fwd_netdev.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index aba11c2333f3..ddd28de810b6 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -190,6 +190,13 @@ nla_put_failure: return -1; } +static int nft_fwd_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS)); +} + static struct nft_expr_type nft_fwd_netdev_type; static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = { .type = &nft_fwd_netdev_type, @@ -197,6 +204,7 @@ static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = { .eval = nft_fwd_neigh_eval, .init = nft_fwd_neigh_init, .dump = nft_fwd_neigh_dump, + .validate = nft_fwd_validate, }; static const struct nft_expr_ops nft_fwd_netdev_ops = { @@ -205,6 +213,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .eval = nft_fwd_netdev_eval, .init = nft_fwd_netdev_init, .dump = nft_fwd_netdev_dump, + .validate = nft_fwd_validate, .offload = nft_fwd_netdev_offload, }; -- cgit v1.2.3 From bcfabee1afd99484b6ba067361b8678e28bbc065 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 23 Mar 2020 19:53:10 +0100 Subject: netfilter: nft_fwd_netdev: allow to redirect to ifb via ingress Set skb->tc_redirected to 1, otherwise the ifb driver drops the packet. Set skb->tc_from_ingress to 1 to reinject the packet back to the ingress path after leaving the ifb egress path. This patch inconditionally sets on these two skb fields that are meaningful to the ifb driver. The existing forward action is guaranteed to run from ingress path. Fixes: 39e6dea28adc ("netfilter: nf_tables: add forward expression to the netdev family") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_fwd_netdev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index ddd28de810b6..74f050ba6bad 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -28,6 +28,10 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, struct nft_fwd_netdev *priv = nft_expr_priv(expr); int oif = regs->data[priv->sreg_dev]; + /* These are used by ifb only. */ + pkt->skb->tc_redirected = 1; + pkt->skb->tc_from_ingress = 1; + nf_fwd_netdev_egress(pkt, oif); regs->verdict.code = NF_STOLEN; } -- cgit v1.2.3 From 961d0e5b32946703125964f9f5b6321d60f4d706 Mon Sep 17 00:00:00 2001 From: Zh-yuan Ye Date: Tue, 24 Mar 2020 17:28:25 +0900 Subject: net: cbs: Fix software cbs to consider packet sending time Currently the software CBS does not consider the packet sending time when depleting the credits. It caused the throughput to be Idleslope[kbps] * (Port transmit rate[kbps] / |Sendslope[kbps]|) where Idleslope * (Port transmit rate / (Idleslope + |Sendslope|)) = Idleslope is expected. In order to fix the issue above, this patch takes the time when the packet sending completes into account by moving the anchor time variable "last" ahead to the send completion time upon transmission and adding wait when the next dequeue request comes before the send completion time of the previous packet. changelog: V2->V3: - remove unnecessary whitespace cleanup - add the checks if port_rate is 0 before division V1->V2: - combine variable "send_completed" into "last" - add the comment for estimate of the packet sending Fixes: 585d763af09c ("net/sched: Introduce Credit Based Shaper (CBS) qdisc") Signed-off-by: Zh-yuan Ye Reviewed-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_cbs.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index b2905b03a432..2eaac2ff380f 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -181,6 +181,11 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) s64 credits; int len; + /* The previous packet is still being sent */ + if (now < q->last) { + qdisc_watchdog_schedule_ns(&q->watchdog, q->last); + return NULL; + } if (q->credits < 0) { credits = timediff_to_credits(now - q->last, q->idleslope); @@ -212,7 +217,12 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) credits += q->credits; q->credits = max_t(s64, credits, q->locredit); - q->last = now; + /* Estimate of the transmission of the last byte of the packet in ns */ + if (unlikely(atomic64_read(&q->port_rate) == 0)) + q->last = now; + else + q->last = now + div64_s64(len * NSEC_PER_SEC, + atomic64_read(&q->port_rate)); return skb; } -- cgit v1.2.3 From e80f40cbe4dd51371818e967d40da8fe305db5e4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 24 Mar 2020 11:45:34 +0200 Subject: net: dsa: tag_8021q: replace dsa_8021q_remove_header with __skb_vlan_pop Not only did this wheel did not need reinventing, but there is also an issue with it: It doesn't remove the VLAN header in a way that preserves the L2 payload checksum when that is being provided by the DSA master hw. It should recalculate checksum both for the push, before removing the header, and for the pull afterwards. But the current implementation is quite dizzying, with pulls followed immediately afterwards by pushes, the memmove is done before the push, etc. This makes a DSA master with RX checksumming offload to print stack traces with the infamous 'hw csum failure' message. So remove the dsa_8021q_remove_header function and replace it with something that actually works with inet checksumming. Fixes: d461933638ae ("net: dsa: tag_8021q: Create helper function for removing VLAN header") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 7 ------- net/dsa/tag_8021q.c | 43 ------------------------------------------- net/dsa/tag_sja1105.c | 19 +++++++++---------- 3 files changed, 9 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 0aa803c451a3..c620d9139c28 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -28,8 +28,6 @@ int dsa_8021q_rx_switch_id(u16 vid); int dsa_8021q_rx_source_port(u16 vid); -struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb); - #else int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, @@ -64,11 +62,6 @@ int dsa_8021q_rx_source_port(u16 vid) return 0; } -struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb) -{ - return NULL; -} - #endif /* IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) */ #endif /* _NET_DSA_8021Q_H */ diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 2fb6c26294b5..b97ad93d1c1a 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -298,47 +298,4 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); -/* In the DSA packet_type handler, skb->data points in the middle of the VLAN - * tag, after tpid and before tci. This is because so far, ETH_HLEN - * (DMAC, SMAC, EtherType) bytes were pulled. - * There are 2 bytes of VLAN tag left in skb->data, and upper - * layers expect the 'real' EtherType to be consumed as well. - * Coincidentally, a VLAN header is also of the same size as - * the number of bytes that need to be pulled. - * - * skb_mac_header skb->data - * | | - * v v - * | | | | | | | | | | | | | | | | | | | - * +-----------------------+-----------------------+-------+-------+-------+ - * | Destination MAC | Source MAC | TPID | TCI | EType | - * +-----------------------+-----------------------+-------+-------+-------+ - * ^ | | - * |<--VLAN_HLEN-->to <---VLAN_HLEN---> - * from | - * >>>>>>> v - * >>>>>>> | | | | | | | | | | | | | | | - * >>>>>>> +-----------------------+-----------------------+-------+ - * >>>>>>> | Destination MAC | Source MAC | EType | - * +-----------------------+-----------------------+-------+ - * ^ ^ - * (now part of | | - * skb->head) skb_mac_header skb->data - */ -struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb) -{ - u8 *from = skb_mac_header(skb); - u8 *dest = from + VLAN_HLEN; - - memmove(dest, from, ETH_HLEN - VLAN_HLEN); - skb_pull(skb, VLAN_HLEN); - skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - skb_pull_rcsum(skb, ETH_HLEN); - - return skb; -} -EXPORT_SYMBOL_GPL(dsa_8021q_remove_header); - MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 5366ea430349..d553bf36bd41 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -250,14 +250,14 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, { struct sja1105_meta meta = {0}; int source_port, switch_id; - struct vlan_ethhdr *hdr; + struct ethhdr *hdr; u16 tpid, vid, tci; bool is_link_local; bool is_tagged; bool is_meta; - hdr = vlan_eth_hdr(skb); - tpid = ntohs(hdr->h_vlan_proto); + hdr = eth_hdr(skb); + tpid = ntohs(hdr->h_proto); is_tagged = (tpid == ETH_P_SJA1105); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); @@ -266,7 +266,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, if (is_tagged) { /* Normal traffic path. */ - tci = ntohs(hdr->h_vlan_TCI); + skb_push_rcsum(skb, ETH_HLEN); + __skb_vlan_pop(skb, &tci); + skb_pull_rcsum(skb, ETH_HLEN); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + vid = tci & VLAN_VID_MASK; source_port = dsa_8021q_rx_source_port(vid); switch_id = dsa_8021q_rx_switch_id(vid); @@ -295,12 +300,6 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, return NULL; } - /* Delete/overwrite fake VLAN header, DSA expects to not find - * it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN). - */ - if (is_tagged) - skb = dsa_8021q_remove_header(skb); - return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, is_meta); } -- cgit v1.2.3 From 0016d3201753b59f3ae84b868fe66c86ad256f19 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 25 Mar 2020 09:05:32 +0100 Subject: nl80211: fix NL80211_ATTR_CHANNEL_WIDTH attribute type The new opmode notification used this attribute with a u8, when it's documented as a u32 and indeed used in userspace as such, it just happens to work on little-endian systems since userspace isn't doing any strict size validation, and the u8 goes into the lower byte. Fix this. Cc: stable@vger.kernel.org Fixes: 466b9936bf93 ("cfg80211: Add support to notify station's opmode change to userspace") Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200325090531.be124f0a11c7.Iedbf4e197a85471ebd729b186d5365c0343bf7a8@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ec5d67794aab..f0af23c1634a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -16416,7 +16416,7 @@ void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac, goto nla_put_failure; if ((sta_opmode->changed & STA_OPMODE_MAX_BW_CHANGED) && - nla_put_u8(msg, NL80211_ATTR_CHANNEL_WIDTH, sta_opmode->bw)) + nla_put_u32(msg, NL80211_ATTR_CHANNEL_WIDTH, sta_opmode->bw)) goto nla_put_failure; if ((sta_opmode->changed & STA_OPMODE_N_SS_CHANGED) && -- cgit v1.2.3 From 2c64605b590edadb3fb46d1ec6badb49e940b479 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Mar 2020 13:47:18 +0100 Subject: net: Fix CONFIG_NET_CLS_ACT=n and CONFIG_NFT_FWD_NETDEV={y, m} build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/netfilter/nft_fwd_netdev.c: In function ‘nft_fwd_netdev_eval’: net/netfilter/nft_fwd_netdev.c:32:10: error: ‘struct sk_buff’ has no member named ‘tc_redirected’ pkt->skb->tc_redirected = 1; ^~ net/netfilter/nft_fwd_netdev.c:33:10: error: ‘struct sk_buff’ has no member named ‘tc_from_ingress’ pkt->skb->tc_from_ingress = 1; ^~ To avoid a direct dependency with tc actions from netfilter, wrap the redirect bits around CONFIG_NET_REDIRECT and move helpers to include/linux/skbuff.h. Turn on this toggle from the ifb driver, the only existing client of these bits in the tree. This patch adds skb_set_redirected() that sets on the redirected bit on the skbuff, it specifies if the packet was redirect from ingress and resets the timestamp (timestamp reset was originally missing in the netfilter bugfix). Fixes: bcfabee1afd99484 ("netfilter: nft_fwd_netdev: allow to redirect to ifb via ingress") Reported-by: noreply@ellerman.id.au Reported-by: Geert Uytterhoeven Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- drivers/net/Kconfig | 1 + drivers/net/ifb.c | 6 +++--- drivers/net/wireguard/queueing.h | 2 +- include/linux/skbuff.h | 36 ++++++++++++++++++++++++++++++++---- include/net/sch_generic.h | 16 ---------------- net/Kconfig | 3 +++ net/core/dev.c | 4 ++-- net/core/pktgen.c | 2 +- net/netfilter/nft_fwd_netdev.c | 5 ++--- net/sched/act_mirred.c | 6 ++---- 10 files changed, 47 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 25a8f9387d5a..db8884ad6d40 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -149,6 +149,7 @@ config NET_FC config IFB tristate "Intermediate Functional Block support" depends on NET_CLS_ACT + select NET_REDIRECT ---help--- This is an intermediate driver that allows sharing of resources. diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index 242b9b0943f8..7fe306e76281 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -75,7 +75,7 @@ static void ifb_ri_tasklet(unsigned long _txp) } while ((skb = __skb_dequeue(&txp->tq)) != NULL) { - skb->tc_redirected = 0; + skb->redirected = 0; skb->tc_skip_classify = 1; u64_stats_update_begin(&txp->tsync); @@ -96,7 +96,7 @@ static void ifb_ri_tasklet(unsigned long _txp) rcu_read_unlock(); skb->skb_iif = txp->dev->ifindex; - if (!skb->tc_from_ingress) { + if (!skb->from_ingress) { dev_queue_xmit(skb); } else { skb_pull_rcsum(skb, skb->mac_len); @@ -243,7 +243,7 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev) txp->rx_bytes += skb->len; u64_stats_update_end(&txp->rsync); - if (!skb->tc_redirected || !skb->skb_iif) { + if (!skb->redirected || !skb->skb_iif) { dev_kfree_skb(skb); dev->stats.rx_dropped++; return NETDEV_TX_OK; diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h index cf1e0e2376d8..3432232afe06 100644 --- a/drivers/net/wireguard/queueing.h +++ b/drivers/net/wireguard/queueing.h @@ -100,8 +100,8 @@ static inline void wg_reset_packet(struct sk_buff *skb) skb->dev = NULL; #ifdef CONFIG_NET_SCHED skb->tc_index = 0; - skb_reset_tc(skb); #endif + skb_reset_redirect(skb); skb->hdr_len = skb_headroom(skb); skb_reset_mac_header(skb); skb_reset_network_header(skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5b50278c4bc8..e59620234415 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -645,8 +645,8 @@ typedef unsigned char *sk_buff_data_t; * @offload_l3_fwd_mark: Packet was L3-forwarded in hardware * @tc_skip_classify: do not classify packet. set by IFB device * @tc_at_ingress: used within tc_classify to distinguish in/egress - * @tc_redirected: packet was redirected by a tc action - * @tc_from_ingress: if tc_redirected, tc_at_ingress at time of redirect + * @redirected: packet was redirected by packet classifier + * @from_ingress: packet was redirected from the ingress path * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag @@ -848,8 +848,10 @@ struct sk_buff { #ifdef CONFIG_NET_CLS_ACT __u8 tc_skip_classify:1; __u8 tc_at_ingress:1; - __u8 tc_redirected:1; - __u8 tc_from_ingress:1; +#endif +#ifdef CONFIG_NET_REDIRECT + __u8 redirected:1; + __u8 from_ingress:1; #endif #ifdef CONFIG_TLS_DEVICE __u8 decrypted:1; @@ -4579,5 +4581,31 @@ static inline __wsum lco_csum(struct sk_buff *skb) return csum_partial(l4_hdr, csum_start - l4_hdr, partial); } +static inline bool skb_is_redirected(const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_REDIRECT + return skb->redirected; +#else + return false; +#endif +} + +static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) +{ +#ifdef CONFIG_NET_REDIRECT + skb->redirected = 1; + skb->from_ingress = from_ingress; + if (skb->from_ingress) + skb->tstamp = 0; +#endif +} + +static inline void skb_reset_redirect(struct sk_buff *skb) +{ +#ifdef CONFIG_NET_REDIRECT + skb->redirected = 0; +#endif +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 151208704ed2..c30f914867e6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -675,22 +675,6 @@ void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab); int skb_do_redirect(struct sk_buff *); -static inline void skb_reset_tc(struct sk_buff *skb) -{ -#ifdef CONFIG_NET_CLS_ACT - skb->tc_redirected = 0; -#endif -} - -static inline bool skb_is_tc_redirected(const struct sk_buff *skb) -{ -#ifdef CONFIG_NET_CLS_ACT - return skb->tc_redirected; -#else - return false; -#endif -} - static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { #ifdef CONFIG_NET_CLS_ACT diff --git a/net/Kconfig b/net/Kconfig index 2eeb0e55f7c9..df8d8c9bd021 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -52,6 +52,9 @@ config NET_INGRESS config NET_EGRESS bool +config NET_REDIRECT + bool + config SKB_EXTENSIONS bool diff --git a/net/core/dev.c b/net/core/dev.c index 402a986659cf..500bba8874b0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4516,7 +4516,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ - if (skb_is_tc_redirected(skb)) + if (skb_is_redirected(skb)) return XDP_PASS; /* XDP packets must be linear and must have sufficient headroom @@ -5063,7 +5063,7 @@ skip_taps: goto out; } #endif - skb_reset_tc(skb); + skb_reset_redirect(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index acc849df60b5..d0641bba6b81 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3362,7 +3362,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) /* skb was 'freed' by stack, so clean few * bits and reuse it */ - skb_reset_tc(skb); + skb_reset_redirect(skb); } while (--burst > 0); goto out; /* Skips xmit_mode M_START_XMIT */ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 74f050ba6bad..3087e23297db 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -28,9 +28,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, struct nft_fwd_netdev *priv = nft_expr_priv(expr); int oif = regs->data[priv->sreg_dev]; - /* These are used by ifb only. */ - pkt->skb->tc_redirected = 1; - pkt->skb->tc_from_ingress = 1; + /* This is used by ifb only. */ + skb_set_redirected(pkt->skb, true); nf_fwd_netdev_egress(pkt, oif); regs->verdict.code = NF_STOLEN; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1ad300e6dbc0..83dd82fc9f40 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -284,10 +284,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, /* mirror is always swallowed */ if (is_redirect) { - skb2->tc_redirected = 1; - skb2->tc_from_ingress = skb2->tc_at_ingress; - if (skb2->tc_from_ingress) - skb2->tstamp = 0; + skb_set_redirected(skb2, skb2->tc_at_ingress); + /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; -- cgit v1.2.3 From a0761a301746ec2d92d7fcb82af69c0a6a4339aa Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Mar 2020 15:09:42 +0200 Subject: mac80211: drop data frames without key on encrypted links If we know that we have an encrypted link (based on having had a key configured for TX in the past) then drop all data frames in the key selection handler if there's no key anymore. This fixes an issue with mac80211 internal TXQs - there we can buffer frames for an encrypted link, but then if the key is no longer there when they're dequeued, the frames are sent without encryption. This happens if a station is disconnected while the frames are still on the TXQ. Detecting that a link should be encrypted based on a first key having been configured for TX is fine as there are no use cases for a connection going from with encryption to no encryption. With extended key IDs, however, there is a case of having a key configured for only decryption, so we can't just trigger this behaviour on a key being configured. Cc: stable@vger.kernel.org Reported-by: Jouni Malinen Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.6865c7f28a14.I9fb1d911b064262d33e33dfba730cdeef83926ca@changeid Signed-off-by: Johannes Berg --- net/mac80211/debugfs_sta.c | 3 ++- net/mac80211/key.c | 20 ++++++++++++-------- net/mac80211/sta_info.h | 1 + net/mac80211/tx.c | 12 +++++++++--- 4 files changed, 24 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index c80b1e163ea4..3419ed66c7b0 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -5,7 +5,7 @@ * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include @@ -78,6 +78,7 @@ static const char * const sta_flag_names[] = { FLAG(MPSP_OWNER), FLAG(MPSP_RECIPIENT), FLAG(PS_DELIVER), + FLAG(USES_ENCRYPTION), #undef FLAG }; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 0f889b919b06..efc1acc6543c 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -6,7 +6,7 @@ * Copyright 2007-2008 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright 2018-2019 Intel Corporation + * Copyright 2018-2020 Intel Corporation */ #include @@ -262,22 +262,29 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) sta ? sta->sta.addr : bcast_addr, ret); } -int ieee80211_set_tx_key(struct ieee80211_key *key) +static int _ieee80211_set_tx_key(struct ieee80211_key *key, bool force) { struct sta_info *sta = key->sta; struct ieee80211_local *local = key->local; assert_key_lock(local); + set_sta_flag(sta, WLAN_STA_USES_ENCRYPTION); + sta->ptk_idx = key->conf.keyidx; - if (!ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT)) + if (force || !ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT)) clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_check_fast_xmit(sta); return 0; } +int ieee80211_set_tx_key(struct ieee80211_key *key) +{ + return _ieee80211_set_tx_key(key, false); +} + static void ieee80211_pairwise_rekey(struct ieee80211_key *old, struct ieee80211_key *new) { @@ -441,11 +448,8 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, if (pairwise) { rcu_assign_pointer(sta->ptk[idx], new); if (new && - !(new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX)) { - sta->ptk_idx = idx; - clear_sta_flag(sta, WLAN_STA_BLOCK_BA); - ieee80211_check_fast_xmit(sta); - } + !(new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX)) + _ieee80211_set_tx_key(new, true); } else { rcu_assign_pointer(sta->gtk[idx], new); } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index c00e28585f9d..552eed36faca 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -98,6 +98,7 @@ enum ieee80211_sta_info_flags { WLAN_STA_MPSP_OWNER, WLAN_STA_MPSP_RECIPIENT, WLAN_STA_PS_DELIVER, + WLAN_STA_USES_ENCRYPTION, NUM_WLAN_STA_FLAGS, }; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 87def9cb91ff..7dbfb9e3cd84 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -590,10 +590,13 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; - if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)) + if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)) { tx->key = NULL; - else if (tx->sta && - (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx]))) + return TX_CONTINUE; + } + + if (tx->sta && + (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx]))) tx->key = key; else if (ieee80211_is_group_privacy_action(tx->skb) && (key = rcu_dereference(tx->sdata->default_multicast_key))) @@ -654,6 +657,9 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (!skip_hw && tx->key && tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) info->control.hw_key = &tx->key->conf; + } else if (!ieee80211_is_mgmt(hdr->frame_control) && tx->sta && + test_sta_flag(tx->sta, WLAN_STA_USES_ENCRYPTION)) { + return TX_DROP; } return TX_CONTINUE; -- cgit v1.2.3 From 05dcb8bb258575a8dd3499d0d78bd2db633c2b23 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 26 Mar 2020 15:09:43 +0200 Subject: cfg80211: Do not warn on same channel at the end of CSA When cfg80211_update_assoc_bss_entry() is called, there is a verification that the BSS channel actually changed. As some APs use CSA also for bandwidth changes, this would result with a kernel warning. Fix this by removing the WARN_ON(). Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.96316ada0e8d.I6710376b1b4257e5f4712fc7ab16e2b638d512aa@changeid Signed-off-by: Johannes Berg --- net/wireless/scan.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index aef240fdf8df..328402ab64a3 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -2022,7 +2022,11 @@ void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, spin_lock_bh(&rdev->bss_lock); - if (WARN_ON(cbss->pub.channel == chan)) + /* + * Some APs use CSA also for bandwidth changes, i.e., without actually + * changing the control channel, so no need to update in such a case. + */ + if (cbss->pub.channel == chan) goto done; /* use transmitting bss */ -- cgit v1.2.3 From ce2e1ca703071723ca2dd94d492a5ab6d15050da Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 26 Mar 2020 15:51:34 +0100 Subject: mac80211: Check port authorization in the ieee80211_tx_dequeue() case mac80211 used to check port authorization in the Data frame enqueue case when going through start_xmit(). However, that authorization status may change while the frame is waiting in a queue. Add a similar check in the dequeue case to avoid sending previously accepted frames after authorization change. This provides additional protection against potential leaking of frames after a station has been disconnected and the keys for it are being removed. Cc: stable@vger.kernel.org Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200326155133.ced84317ea29.I34d4c47cd8cc8a4042b38a76f16a601fbcbfd9b3@changeid Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 7dbfb9e3cd84..455eb8e6a459 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3604,8 +3604,25 @@ begin: tx.skb = skb; tx.sdata = vif_to_sdata(info->control.vif); - if (txq->sta) + if (txq->sta) { tx.sta = container_of(txq->sta, struct sta_info, sta); + /* + * Drop unicast frames to unauthorised stations unless they are + * EAPOL frames from the local station. + */ + if (unlikely(!ieee80211_vif_is_mesh(&tx.sdata->vif) && + tx.sdata->vif.type != NL80211_IFTYPE_OCB && + !is_multicast_ether_addr(hdr->addr1) && + !test_sta_flag(tx.sta, WLAN_STA_AUTHORIZED) && + (!(info->control.flags & + IEEE80211_TX_CTRL_PORT_CTRL_PROTO) || + !ether_addr_equal(tx.sdata->vif.addr, + hdr->addr2)))) { + I802_DEBUG_INC(local->tx_handlers_drop_unauth_port); + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } + } /* * The key can be removed while the packet was queued, so need to call -- cgit v1.2.3 From b16798f5b907733966fd1a558fca823b3c67e4a1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Mar 2020 15:51:35 +0100 Subject: mac80211: mark station unauthorized before key removal If a station is still marked as authorized, mark it as no longer so before removing its keys. This allows frames transmitted to it to be rejected, providing additional protection against leaking plain text data during the disconnection flow. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200326155133.ccb4fb0bb356.If48f0f0504efdcf16b8921f48c6d3bb2cb763c99@changeid Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 0f5f40678885..e3572be307d6 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #include @@ -1049,6 +1049,11 @@ static void __sta_info_destroy_part2(struct sta_info *sta) might_sleep(); lockdep_assert_held(&local->sta_mtx); + while (sta->sta_state == IEEE80211_STA_AUTHORIZED) { + ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); + WARN_ON_ONCE(ret); + } + /* now keys can no longer be reached */ ieee80211_free_sta_keys(local, sta); -- cgit v1.2.3 From b95d2ccd2ccb834394d50347d0e40dc38a954e4a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Mar 2020 15:53:34 +0100 Subject: mac80211: set IEEE80211_TX_CTRL_PORT_CTRL_PROTO for nl80211 TX When a frame is transmitted via the nl80211 TX rather than as a normal frame, IEEE80211_TX_CTRL_PORT_CTRL_PROTO wasn't set and this will lead to wrong decisions (rate control etc.) being made about the frame; fix this. Fixes: 911806491425 ("mac80211: Add support for tx_control_port") Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200326155333.f183f52b02f0.I4054e2a8c11c2ddcb795a0103c87be3538690243@changeid Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 455eb8e6a459..d9cca6dbd870 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018, 2020 Intel Corporation * * Transmit and frame generation functions. */ @@ -5149,6 +5149,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ethhdr *ehdr; + u32 ctrl_flags = 0; u32 flags; /* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE @@ -5158,6 +5159,9 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, proto != cpu_to_be16(ETH_P_PREAUTH)) return -EINVAL; + if (proto == sdata->control_port_protocol) + ctrl_flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO; + if (unencrypted) flags = IEEE80211_TX_INTFL_DONT_ENCRYPT; else @@ -5183,7 +5187,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, skb_reset_mac_header(skb); local_bh_disable(); - __ieee80211_subif_start_xmit(skb, skb->dev, flags, 0); + __ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags); local_bh_enable(); return 0; -- cgit v1.2.3