From 071d36bf21bcc837be00cea55bcef8d129e7f609 Mon Sep 17 00:00:00 2001 From: "subashab@codeaurora.org" Date: Wed, 23 Mar 2016 22:39:50 -0600 Subject: xfrm: Fix crash observed during device unregistration and decryption A crash is observed when a decrypted packet is processed in receive path. get_rps_cpus() tries to dereference the skb->dev fields but it appears that the device is freed from the poison pattern. [] get_rps_cpu+0x94/0x2f0 [] netif_rx_internal+0x140/0x1cc [] netif_rx+0x74/0x94 [] xfrm_input+0x754/0x7d0 [] xfrm_input_resume+0x10/0x1c [] esp_input_done+0x20/0x30 [] process_one_work+0x244/0x3fc [] worker_thread+0x2f8/0x418 [] kthread+0xe0/0xec -013|get_rps_cpu( | dev = 0xFFFFFFC08B688000, | skb = 0xFFFFFFC0C76AAC00 -> ( | dev = 0xFFFFFFC08B688000 -> ( | name = "...................................................... | name_hlist = (next = 0xAAAAAAAAAAAAAAAA, pprev = 0xAAAAAAAAAAA Following are the sequence of events observed - - Encrypted packet in receive path from netdevice is queued - Encrypted packet queued for decryption (asynchronous) - Netdevice brought down and freed - Packet is decrypted and returned through callback in esp_input_done - Packet is queued again for process in network stack using netif_rx Since the device appears to have been freed, the dereference of skb->dev in get_rps_cpus() leads to an unhandled page fault exception. Fix this by holding on to device reference when queueing packets asynchronously and releasing the reference on call back return. v2: Make the change generic to xfrm as mentioned by Steffen and update the title to xfrm Suggested-by: Herbert Xu Signed-off-by: Jerome Stanislaus Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- net/xfrm/xfrm_input.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index ad7f5b3f9b61..1c4ad477ce93 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -292,12 +292,15 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; skb_dst_force(skb); + dev_hold(skb->dev); nexthdr = x->type->input(x, skb); if (nexthdr == -EINPROGRESS) return 0; resume: + dev_put(skb->dev); + spin_lock(&x->lock); if (nexthdr <= 0) { if (nexthdr == -EBADMSG) { -- cgit v1.2.3 From 3e347660488818070bff7533f8561928e09e1d65 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 24 Mar 2016 16:50:00 +0100 Subject: switchdev: fix typo in comments/doc Two minor typo. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- Documentation/networking/switchdev.txt | 2 +- net/switchdev/switchdev.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index fad63136ee3e..2f659129694b 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -386,7 +386,7 @@ used. First phase is to "prepare" anything needed, including various checks, memory allocation, etc. The goal is to handle the stuff that is not unlikely to fail here. The second phase is to "commit" the actual changes. -Switchdev provides an inftrastructure for sharing items (for example memory +Switchdev provides an infrastructure for sharing items (for example memory allocations) between the two phases. The object created by a driver in "prepare" phase and it is queued up by: diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 8b5833c1ff2e..2b9b98f1c2ff 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1079,7 +1079,7 @@ nla_put_failure: * @filter_dev: filter device * @idx: * - * Delete FDB entry from switch device. + * Dump FDB entries from switch device. */ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, -- cgit v1.2.3 From 543e3a8da5a4c453e992d5351ef405d5e32f27d7 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 24 Mar 2016 21:56:21 -0500 Subject: netpoll: Fix extra refcount release in netpoll_cleanup() netpoll_setup() does a dev_hold() on np->dev, the netpoll device. If it fails, it correctly does a dev_put() but leaves np->dev set. If we call netpoll_cleanup() after the failure, np->dev is still set so we do another dev_put(), which decrements the refcount an extra time. It's questionable to call netpoll_cleanup() after netpoll_setup() fails, but it can be difficult to find the problem, and we can easily avoid it in this case. The extra decrements can lead to hangs like this: unregister_netdevice: waiting for bond0 to become free. Usage count = -3 Set and clear np->dev at the points where we dev_hold() and dev_put() the device. Signed-off-by: Bjorn Helgaas Signed-off-by: David S. Miller --- net/core/netpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 94acfc89ad97..a57bd17805b4 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -603,7 +603,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) const struct net_device_ops *ops; int err; - np->dev = ndev; strlcpy(np->dev_name, ndev->name, IFNAMSIZ); INIT_WORK(&np->cleanup_work, netpoll_async_cleanup); @@ -670,6 +669,7 @@ int netpoll_setup(struct netpoll *np) goto unlock; } dev_hold(ndev); + np->dev = ndev; if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", np->dev_name); @@ -770,6 +770,7 @@ int netpoll_setup(struct netpoll *np) return 0; put: + np->dev = NULL; dev_put(ndev); unlock: rtnl_unlock(); -- cgit v1.2.3 From 995096a0a438396a570104e9c240a2a26623a5a1 Mon Sep 17 00:00:00 2001 From: Quentin Armitage Date: Sun, 27 Mar 2016 17:06:11 +0100 Subject: Fix returned tc and hoplimit values for route with IPv6 encapsulation For a route with IPv6 encapsulation, the traffic class and hop limit values are interchanged when returned to userspace by the kernel. For example, see below. ># ip route add 192.168.0.1 dev eth0.2 encap ip6 dst 0x50 tc 0x50 hoplimit 100 table 1000 ># ip route show table 1000 192.168.0.1 encap ip6 id 0 src :: dst fe83::1 hoplimit 80 tc 100 dev eth0.2 scope link Signed-off-by: Quentin Armitage Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 02dd990af542..6165f30c4d72 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -372,8 +372,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) || nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) || nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || - nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) || - nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) || + nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || + nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; -- cgit v1.2.3 From ac71b46efd2838c02ec193987c8f61c3ba33b495 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Mon, 28 Mar 2016 18:08:59 +0800 Subject: openvswitch: Use proper buffer size in nla_memcpy For the input parameter count, it's better to use the size of destination buffer size, as nla_memcpy would take into account the length of the source netlink attribute when a data is copied from an attribute. Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index dc5eb29fe7d6..f8a8d4390a8a 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -968,7 +968,8 @@ static int parse_nat(const struct nlattr *attr, break; case OVS_NAT_ATTR_IP_MIN: - nla_memcpy(&info->range.min_addr, a, nla_len(a)); + nla_memcpy(&info->range.min_addr, a, + sizeof(info->range.min_addr)); info->range.flags |= NF_NAT_RANGE_MAP_IPS; break; -- cgit v1.2.3 From 596cf3fe5854fe2b1703b0466ed6bf9cfb83c91e Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Wed, 16 Mar 2016 21:49:00 +0100 Subject: netfilter: ipset: fix race condition in ipset save, swap and delete This fix adds a new reference counter (ref_netlink) for the struct ip_set. The other reference counter (ref) can be swapped out by ip_set_swap and we need a separate counter to keep track of references for netlink events like dump. Using the same ref counter for dump causes a race condition which can be demonstrated by the following script: ipset create hash_ip1 hash:ip family inet hashsize 1024 maxelem 500000 \ counters ipset create hash_ip2 hash:ip family inet hashsize 300000 maxelem 500000 \ counters ipset create hash_ip3 hash:ip family inet hashsize 1024 maxelem 500000 \ counters ipset save & ipset swap hash_ip3 hash_ip2 ipset destroy hash_ip3 /* will crash the machine */ Swap will exchange the values of ref so destroy will see ref = 0 instead of ref = 1. With this fix in place swap will not succeed because ipset save still has ref_netlink on the set (ip_set_swap doesn't swap ref_netlink). Both delete and swap will error out if ref_netlink != 0 on the set. Note: The changes to *_head functions is because previously we would increment ref whenever we called these functions, we don't do that anymore. Reviewed-by: Joshua Hunt Signed-off-by: Vishwanath Pai Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 4 ++++ net/netfilter/ipset/ip_set_bitmap_gen.h | 2 +- net/netfilter/ipset/ip_set_core.c | 33 ++++++++++++++++++++++++++++----- net/netfilter/ipset/ip_set_hash_gen.h | 2 +- net/netfilter/ipset/ip_set_list_set.c | 2 +- 5 files changed, 35 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 0e1f433cc4b7..f48b8a664b0f 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -234,6 +234,10 @@ struct ip_set { spinlock_t lock; /* References to the set */ u32 ref; + /* References to the set for netlink events like dump, + * ref can be swapped out by ip_set_swap + */ + u32 ref_netlink; /* The core set type */ struct ip_set_type *type; /* The type variant doing the real job */ diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index b0bc475f641e..2e8e7e5fb4a6 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -95,7 +95,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) if (!nested) goto nla_put_failure; if (mtype_do_head(skb, map) || - nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 7e6568cad494..a748b0c2c981 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -497,6 +497,26 @@ __ip_set_put(struct ip_set *set) write_unlock_bh(&ip_set_ref_lock); } +/* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need + * a separate reference counter + */ +static inline void +__ip_set_get_netlink(struct ip_set *set) +{ + write_lock_bh(&ip_set_ref_lock); + set->ref_netlink++; + write_unlock_bh(&ip_set_ref_lock); +} + +static inline void +__ip_set_put_netlink(struct ip_set *set) +{ + write_lock_bh(&ip_set_ref_lock); + BUG_ON(set->ref_netlink == 0); + set->ref_netlink--; + write_unlock_bh(&ip_set_ref_lock); +} + /* Add, del and test set entries from kernel. * * The set behind the index must exist and must be referenced @@ -1002,7 +1022,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s && s->ref) { + if (s && (s->ref || s->ref_netlink)) { ret = -IPSET_ERR_BUSY; goto out; } @@ -1024,7 +1044,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, if (!s) { ret = -ENOENT; goto out; - } else if (s->ref) { + } else if (s->ref || s->ref_netlink) { ret = -IPSET_ERR_BUSY; goto out; } @@ -1171,6 +1191,9 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; + if (from->ref_netlink || to->ref_netlink) + return -EBUSY; + strncpy(from_name, from->name, IPSET_MAXNAMELEN); strncpy(from->name, to->name, IPSET_MAXNAMELEN); strncpy(to->name, from_name, IPSET_MAXNAMELEN); @@ -1206,7 +1229,7 @@ ip_set_dump_done(struct netlink_callback *cb) if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); - __ip_set_put_byindex(inst, index); + __ip_set_put_netlink(set); } return 0; } @@ -1328,7 +1351,7 @@ dump_last: if (!cb->args[IPSET_CB_ARG0]) { /* Start listing: make sure set won't be destroyed */ pr_debug("reference set\n"); - set->ref++; + set->ref_netlink++; } write_unlock_bh(&ip_set_ref_lock); nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, @@ -1396,7 +1419,7 @@ release_refcount: if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); - __ip_set_put_byindex(inst, index); + __ip_set_put_netlink(set); cb->args[IPSET_CB_ARG0] = 0; } out: diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index e5336ab36d67..d32fd6b036bf 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1082,7 +1082,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) goto nla_put_failure; #endif - if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 24c6c1962aea..a2a89e4e0a14 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -458,7 +458,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) if (!nested) goto nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || - nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(sizeof(*map) + n * set->dsize))) goto nla_put_failure; -- cgit v1.2.3 From 5745b0be05a0f8ccbc92a36b69f3a6bc58e91954 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Mon, 21 Mar 2016 11:15:19 -0700 Subject: openvswitch: Fix checking for new expected connections. OVS should call into CT NAT for packets of new expected connections only when the conntrack state is persisted with the 'commit' option to the OVS CT action. The test for this condition is doubly wrong, as the CT status field is ANDed with the bit number (IPS_EXPECTED_BIT) rather than the mask (IPS_EXPECTED), and due to the wrong assumption that the expected bit would apply only for the first (i.e., 'new') packet of a connection, while in fact the expected bit remains on for the lifetime of an expected connection. The 'ctinfo' value IP_CT_RELATED derived from the ct status can be used instead, as it is only ever applicable to the 'new' packets of the expected connection. Fixes: 05752523e565 ('openvswitch: Interface with NAT.') Reported-by: Dan Carpenter Signed-off-by: Jarno Rajahalme Signed-off-by: Pablo Neira Ayuso --- net/openvswitch/conntrack.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index dc5eb29fe7d6..47f7c62761d2 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -664,11 +664,12 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, /* Determine NAT type. * Check if the NAT type can be deduced from the tracked connection. - * Make sure expected traffic is NATted only when committing. + * Make sure new expected connections (IP_CT_RELATED) are NATted only + * when committing. */ if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW && ct->status & IPS_NAT_MASK && - (!(ct->status & IPS_EXPECTED_BIT) || info->commit)) { + (ctinfo != IP_CT_RELATED || info->commit)) { /* NAT an established or related connection like before. */ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) /* This is the REPLY direction for a connection -- cgit v1.2.3 From 99b7248e2ad57ca93ada10c6598affb267ffc99a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 18 Mar 2016 14:33:45 +0100 Subject: openvswitch: call only into reachable nf-nat code The openvswitch code has gained support for calling into the nf-nat-ipv4/ipv6 modules, however those can be loadable modules in a configuration in which openvswitch is built-in, leading to link errors: net/built-in.o: In function `__ovs_ct_lookup': :(.text+0x2cc2c8): undefined reference to `nf_nat_icmp_reply_translation' :(.text+0x2cc66c): undefined reference to `nf_nat_icmpv6_reply_translation' The dependency on (!NF_NAT || NF_NAT) prevents similar issues, but NF_NAT is set to 'y' if any of the symbols selecting it are built-in, but the link error happens when any of them are modular. A second issue is that even if CONFIG_NF_NAT_IPV6 is built-in, CONFIG_NF_NAT_IPV4 might be completely disabled. This is unlikely to be useful in practice, but the driver currently only handles IPv6 being optional. This patch improves the Kconfig dependency so that openvswitch cannot be built-in if either of the two other symbols are set to 'm', and it replaces the incorrect #ifdef in ovs_ct_nat_execute() with two "if (IS_ENABLED())" checks that should catch all corner cases also make the code more readable. The same #ifdef exists ovs_ct_nat_to_attr(), where it does not cause a link error, but for consistency I'm changing it the same way. Signed-off-by: Arnd Bergmann Fixes: 05752523e565 ("openvswitch: Interface with NAT.") Acked-by: Joe Stringer Signed-off-by: Pablo Neira Ayuso --- net/openvswitch/Kconfig | 4 +++- net/openvswitch/conntrack.c | 16 ++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 234a73344c6e..ce947292ae77 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -7,7 +7,9 @@ config OPENVSWITCH depends on INET depends on !NF_CONNTRACK || \ (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \ - (!NF_NAT || NF_NAT))) + (!NF_NAT || NF_NAT) && \ + (!NF_NAT_IPV4 || NF_NAT_IPV4) && \ + (!NF_NAT_IPV6 || NF_NAT_IPV6))) select LIBCRC32C select MPLS select NET_MPLS_GSO diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 47f7c62761d2..3797879b0bf8 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -535,14 +535,15 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED_REPLY: - if (skb->protocol == htons(ETH_P_IP) && + if (IS_ENABLED(CONFIG_NF_NAT_IPV4) && + skb->protocol == htons(ETH_P_IP) && ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, hooknum)) err = NF_DROP; goto push; -#if IS_ENABLED(CONFIG_NF_NAT_IPV6) - } else if (skb->protocol == htons(ETH_P_IPV6)) { + } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) && + skb->protocol == htons(ETH_P_IPV6)) { __be16 frag_off; u8 nexthdr = ipv6_hdr(skb)->nexthdr; int hdrlen = ipv6_skip_exthdr(skb, @@ -557,7 +558,6 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, err = NF_DROP; goto push; } -#endif } /* Non-ICMP, fall thru to initialize if needed. */ case IP_CT_NEW: @@ -1239,7 +1239,8 @@ static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info, } if (info->range.flags & NF_NAT_RANGE_MAP_IPS) { - if (info->family == NFPROTO_IPV4) { + if (IS_ENABLED(CONFIG_NF_NAT_IPV4) && + info->family == NFPROTO_IPV4) { if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN, info->range.min_addr.ip) || (info->range.max_addr.ip @@ -1247,8 +1248,8 @@ static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info, (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX, info->range.max_addr.ip)))) return false; -#if IS_ENABLED(CONFIG_NF_NAT_IPV6) - } else if (info->family == NFPROTO_IPV6) { + } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) && + info->family == NFPROTO_IPV6) { if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN, &info->range.min_addr.in6) || (memcmp(&info->range.max_addr.in6, @@ -1257,7 +1258,6 @@ static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info, (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX, &info->range.max_addr.in6)))) return false; -#endif } else { return false; } -- cgit v1.2.3 From bdf533de6968e9686df777dc178486f600c6e617 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:49 +0100 Subject: netfilter: x_tables: validate e->target_offset early We should check that e->target_offset is sane before mark_source_chains gets called since it will fetch the target entry for loop detection. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 17 ++++++++--------- net/ipv4/netfilter/ip_tables.c | 17 ++++++++--------- net/ipv6/netfilter/ip6_tables.c | 17 ++++++++--------- 3 files changed, 24 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index bf081927e06b..830bbe8ec13d 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -474,14 +474,12 @@ next: return 1; } -static inline int check_entry(const struct arpt_entry *e, const char *name) +static inline int check_entry(const struct arpt_entry *e) { const struct xt_entry_target *t; - if (!arp_checkentry(&e->arp)) { - duprintf("arp_tables: arp check failed %p %s.\n", e, name); + if (!arp_checkentry(&e->arp)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; @@ -522,10 +520,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) struct xt_target *target; int ret; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -576,6 +570,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { @@ -590,6 +585,10 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_ARP_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1246,7 +1245,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct arpt_entry *)e, name); + ret = check_entry((struct arpt_entry *)e); if (ret) return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index e53f8d6f326d..1d72a3c4a7e7 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -569,14 +569,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) } static int -check_entry(const struct ipt_entry *e, const char *name) +check_entry(const struct ipt_entry *e) { const struct xt_entry_target *t; - if (!ip_checkentry(&e->ip)) { - duprintf("ip check failed %p %s.\n", e, name); + if (!ip_checkentry(&e->ip)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) @@ -666,10 +664,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -741,6 +735,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { @@ -755,6 +750,10 @@ check_entry_size_and_hooks(struct ipt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1506,7 +1505,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct ipt_entry *)e, name); + ret = check_entry((struct ipt_entry *)e); if (ret) return ret; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 84f9baf7aee8..26a5ad1cc4fd 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -581,14 +581,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) } static int -check_entry(const struct ip6t_entry *e, const char *name) +check_entry(const struct ip6t_entry *e) { const struct xt_entry_target *t; - if (!ip6_checkentry(&e->ipv6)) { - duprintf("ip_tables: ip check failed %p %s.\n", e, name); + if (!ip6_checkentry(&e->ipv6)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) @@ -679,10 +677,6 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -753,6 +747,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { @@ -767,6 +762,10 @@ check_entry_size_and_hooks(struct ip6t_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1518,7 +1517,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct ip6t_entry *)e, name); + ret = check_entry((struct ip6t_entry *)e); if (ret) return ret; -- cgit v1.2.3 From 6e94e0cfb0887e4013b3b930fa6ab1fe6bb6ba91 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:50 +0100 Subject: netfilter: x_tables: make sure e->next_offset covers remaining blob size Otherwise this function may read data beyond the ruleset blob. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 6 ++++-- net/ipv4/netfilter/ip_tables.c | 6 ++++-- net/ipv6/netfilter/ip6_tables.c | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 830bbe8ec13d..51d4fe56b807 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -573,7 +573,8 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1232,7 +1233,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 1d72a3c4a7e7..fb7694e6663e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -738,7 +738,8 @@ check_entry_size_and_hooks(struct ipt_entry *e, int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1492,7 +1493,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 26a5ad1cc4fd..b248528f2a17 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -750,7 +750,8 @@ check_entry_size_and_hooks(struct ip6t_entry *e, int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || - (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { + (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1504,7 +1505,8 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } -- cgit v1.2.3 From 54d83fc74aa9ec72794373cb47432c5f7fb1a309 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:52 +0100 Subject: netfilter: x_tables: fix unconditional helper Ben Hawkes says: In the mark_source_chains function (net/ipv4/netfilter/ip_tables.c) it is possible for a user-supplied ipt_entry structure to have a large next_offset field. This field is not bounds checked prior to writing a counter value at the supplied offset. Problem is that mark_source_chains should not have been called -- the rule doesn't have a next entry, so its supposed to return an absolute verdict of either ACCEPT or DROP. However, the function conditional() doesn't work as the name implies. It only checks that the rule is using wildcard address matching. However, an unconditional rule must also not be using any matches (no -m args). The underflow validator only checked the addresses, therefore passing the 'unconditional absolute verdict' test, while mark_source_chains also tested for presence of matches, and thus proceeeded to the next (not-existent) rule. Unify this so that all the callers have same idea of 'unconditional rule'. Reported-by: Ben Hawkes Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 18 +++++++++--------- net/ipv4/netfilter/ip_tables.c | 23 +++++++++++------------ net/ipv6/netfilter/ip6_tables.c | 23 +++++++++++------------ 3 files changed, 31 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 51d4fe56b807..a1bb5e7129a2 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -359,11 +359,12 @@ unsigned int arpt_do_table(struct sk_buff *skb, } /* All zeroes == unconditional rule. */ -static inline bool unconditional(const struct arpt_arp *arp) +static inline bool unconditional(const struct arpt_entry *e) { static const struct arpt_arp uncond; - return memcmp(arp, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct arpt_entry) && + memcmp(&e->arp, &uncond, sizeof(uncond)) == 0; } /* Figures out from what hook each rule can be called: returns 0 if @@ -402,11 +403,10 @@ static int mark_source_chains(const struct xt_table_info *newinfo, |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); /* Unconditional return/END. */ - if ((e->target_offset == sizeof(struct arpt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->arp)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -551,7 +551,7 @@ static bool check_underflow(const struct arpt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->arp)) + if (!unconditional(e)) return false; t = arpt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -598,9 +598,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index fb7694e6663e..89b5d959eda3 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -168,11 +168,12 @@ get_entry(const void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline bool unconditional(const struct ipt_ip *ip) +static inline bool unconditional(const struct ipt_entry *e) { static const struct ipt_ip uncond; - return memcmp(ip, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct ipt_entry) && + memcmp(&e->ip, &uncond, sizeof(uncond)) == 0; #undef FWINV } @@ -229,11 +230,10 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ipt_entry) && + if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && - t->verdict < 0 && - unconditional(&s->ip)) { + t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP_TRACE_COMMENT_POLICY] @@ -476,11 +476,10 @@ mark_source_chains(const struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ - if ((e->target_offset == sizeof(struct ipt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->ip)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -715,7 +714,7 @@ static bool check_underflow(const struct ipt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->ip)) + if (!unconditional(e)) return false; t = ipt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -763,9 +762,9 @@ check_entry_size_and_hooks(struct ipt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index b248528f2a17..541b59f83595 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -198,11 +198,12 @@ get_entry(const void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline bool unconditional(const struct ip6t_ip6 *ipv6) +static inline bool unconditional(const struct ip6t_entry *e) { static const struct ip6t_ip6 uncond; - return memcmp(ipv6, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct ip6t_entry) && + memcmp(&e->ipv6, &uncond, sizeof(uncond)) == 0; } static inline const struct xt_entry_target * @@ -258,11 +259,10 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ip6t_entry) && + if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && - t->verdict < 0 && - unconditional(&s->ipv6)) { + t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP6_TRACE_COMMENT_POLICY] @@ -488,11 +488,10 @@ mark_source_chains(const struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ - if ((e->target_offset == sizeof(struct ip6t_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && - unconditional(&e->ipv6)) || visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -727,7 +726,7 @@ static bool check_underflow(const struct ip6t_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->ipv6)) + if (!unconditional(e)) return false; t = ip6t_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -775,9 +774,9 @@ check_entry_size_and_hooks(struct ip6t_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; -- cgit v1.2.3 From 931401137f60fc299256bbc221c0b756be31c32c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 23 Mar 2016 12:58:01 +0100 Subject: netfilter: nfnetlink_queue: honor NFQA_CFG_F_FAIL_OPEN when netlink unicast fails When netlink unicast fails to deliver the message to userspace, we should also check if the NFQA_CFG_F_FAIL_OPEN flag is set so we reinject the packet back to the stack. I think the user expects no packet drops when this flag is set due to queueing to userspace errors, no matter if related to the internal queue or when sending the netlink message to userspace. The userspace application will still get the ENOBUFS error via recvmsg() so the user still knows that, with the current configuration that is in place, the userspace application is not consuming the messages at the pace that the kernel needs. Reported-by: "Yigal Reiss (yreiss)" Signed-off-by: Pablo Neira Ayuso Tested-by: "Yigal Reiss (yreiss)" --- net/netfilter/nfnetlink_queue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 75429997ed41..cb5b630a645b 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -582,7 +582,12 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, /* nfnetlink_unicast will either free the nskb or add it to a socket */ err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); if (err < 0) { - queue->queue_user_dropped++; + if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { + failopen = 1; + err = 0; + } else { + queue->queue_user_dropped++; + } goto err_out_unlock; } -- cgit v1.2.3 From b301f2538759933cf9ff1f7c4f968da72e3f0757 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 24 Mar 2016 21:29:53 +0100 Subject: netfilter: x_tables: enforce nul-terminated table name from getsockopt GET_ENTRIES Make sure the table names via getsockopt GET_ENTRIES is nul-terminated in ebtables and all the x_tables variants and their respective compat code. Uncovered by KASAN. Reported-by: Baozeng Ding Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 4 ++++ net/ipv4/netfilter/arp_tables.c | 2 ++ net/ipv4/netfilter/ip_tables.c | 2 ++ net/ipv6/netfilter/ip6_tables.c | 2 ++ 4 files changed, 10 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 67b2e27999aa..8570bc7744c2 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1521,6 +1521,8 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) if (copy_from_user(&tmp, user, sizeof(tmp))) return -EFAULT; + tmp.name[sizeof(tmp.name) - 1] = '\0'; + t = find_table_lock(net, tmp.name, &ret, &ebt_mutex); if (!t) return ret; @@ -2332,6 +2334,8 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, if (copy_from_user(&tmp, user, sizeof(tmp))) return -EFAULT; + tmp.name[sizeof(tmp.name) - 1] = '\0'; + t = find_table_lock(net, tmp.name, &ret, &ebt_mutex); if (!t) return ret; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index a1bb5e7129a2..4133b0f513af 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -969,6 +969,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, sizeof(struct arpt_get_entries) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR_OR_NULL(t)) { @@ -1663,6 +1664,7 @@ static int compat_get_entries(struct net *net, *len, sizeof(get) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 89b5d959eda3..631c100a1338 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1156,6 +1156,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, *len, sizeof(get) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR_OR_NULL(t)) { @@ -1935,6 +1936,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, *len, sizeof(get) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 541b59f83595..86b67b70b626 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1168,6 +1168,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, *len, sizeof(get) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR_OR_NULL(t)) { @@ -1944,6 +1945,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, *len, sizeof(get) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); -- cgit v1.2.3 From 29421198c3a860092e27c2ad8499dfe603398817 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 26 Mar 2016 16:32:57 +0800 Subject: netfilter: ipv4: fix NULL dereference Commit fa50d974d104 ("ipv4: Namespaceify ip_default_ttl sysctl knob") use sock_net(skb->sk) to get the net namespace, but we can't assume that sk_buff->sk is always exist, so when it is NULL, oops will happen. Signed-off-by: Liping Zhang Reviewed-by: Nikolay Borisov Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nft_reject_bridge.c | 20 ++++++------ net/ipv4/netfilter/ipt_SYNPROXY.c | 54 +++++++++++++++++--------------- 2 files changed, 38 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index adc8d7221dbb..77f7e7a9ebe1 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -40,7 +40,8 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb, /* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT) * or the bridge port (NF_BRIDGE PREROUTING). */ -static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, +static void nft_reject_br_send_v4_tcp_reset(struct net *net, + struct sk_buff *oldskb, const struct net_device *dev, int hook) { @@ -48,7 +49,6 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, struct iphdr *niph; const struct tcphdr *oth; struct tcphdr _oth; - struct net *net = sock_net(oldskb->sk); if (!nft_bridge_iphdr_validate(oldskb)) return; @@ -75,7 +75,8 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, br_deliver(br_port_get_rcu(dev), nskb); } -static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, +static void nft_reject_br_send_v4_unreach(struct net *net, + struct sk_buff *oldskb, const struct net_device *dev, int hook, u8 code) { @@ -86,7 +87,6 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, void *payload; __wsum csum; u8 proto; - struct net *net = sock_net(oldskb->sk); if (oldskb->csum_bad || !nft_bridge_iphdr_validate(oldskb)) return; @@ -273,17 +273,17 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr, case htons(ETH_P_IP): switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nft_reject_br_send_v4_unreach(pkt->skb, pkt->in, - pkt->hook, + nft_reject_br_send_v4_unreach(pkt->net, pkt->skb, + pkt->in, pkt->hook, priv->icmp_code); break; case NFT_REJECT_TCP_RST: - nft_reject_br_send_v4_tcp_reset(pkt->skb, pkt->in, - pkt->hook); + nft_reject_br_send_v4_tcp_reset(pkt->net, pkt->skb, + pkt->in, pkt->hook); break; case NFT_REJECT_ICMPX_UNREACH: - nft_reject_br_send_v4_unreach(pkt->skb, pkt->in, - pkt->hook, + nft_reject_br_send_v4_unreach(pkt->net, pkt->skb, + pkt->in, pkt->hook, nft_reject_icmp_code(priv->icmp_code)); break; } diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 7b8fbb352877..db5b87509446 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -18,10 +18,10 @@ #include static struct iphdr * -synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) +synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, + __be32 daddr) { struct iphdr *iph; - struct net *net = sock_net(skb->sk); skb_reset_network_header(skb); iph = (struct iphdr *)skb_put(skb, sizeof(*iph)); @@ -40,14 +40,12 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) } static void -synproxy_send_tcp(const struct synproxy_net *snet, +synproxy_send_tcp(struct net *net, const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct iphdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { - struct net *net = nf_ct_net(snet->tmpl); - nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)nth - nskb->head; @@ -72,7 +70,7 @@ free_nskb: } static void -synproxy_send_client_synack(const struct synproxy_net *snet, +synproxy_send_client_synack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { @@ -91,7 +89,7 @@ synproxy_send_client_synack(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -109,15 +107,16 @@ synproxy_send_client_synack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } static void -synproxy_send_server_syn(const struct synproxy_net *snet, +synproxy_send_server_syn(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts, u32 recv_seq) { + struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *nskb; struct iphdr *iph, *niph; struct tcphdr *nth; @@ -132,7 +131,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -153,12 +152,12 @@ synproxy_send_server_syn(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } static void -synproxy_send_server_ack(const struct synproxy_net *snet, +synproxy_send_server_ack(struct net *net, const struct ip_ct_tcp *state, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) @@ -177,7 +176,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -193,11 +192,11 @@ synproxy_send_server_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void -synproxy_send_client_ack(const struct synproxy_net *snet, +synproxy_send_client_ack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { @@ -215,7 +214,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -231,15 +230,16 @@ synproxy_send_client_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } static bool -synproxy_recv_client_ack(const struct synproxy_net *snet, +synproxy_recv_client_ack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, struct synproxy_options *opts, u32 recv_seq) { + struct synproxy_net *snet = synproxy_pernet(net); int mss; mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); @@ -255,7 +255,7 @@ synproxy_recv_client_ack(const struct synproxy_net *snet, if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_check_timestamp_cookie(opts); - synproxy_send_server_syn(snet, skb, th, opts, recv_seq); + synproxy_send_server_syn(net, skb, th, opts, recv_seq); return true; } @@ -263,7 +263,8 @@ static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; - struct synproxy_net *snet = synproxy_pernet(par->net); + struct net *net = par->net; + struct synproxy_net *snet = synproxy_pernet(net); struct synproxy_options opts = {}; struct tcphdr *th, _th; @@ -292,12 +293,12 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(snet, skb, th, &opts); + synproxy_send_client_synack(net, skb, th, &opts); return NF_DROP; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ - synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); + synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq)); return NF_DROP; } @@ -308,7 +309,8 @@ static unsigned int ipv4_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { - struct synproxy_net *snet = synproxy_pernet(nhs->net); + struct net *net = nhs->net; + struct synproxy_net *snet = synproxy_pernet(net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; @@ -365,7 +367,7 @@ static unsigned int ipv4_synproxy_hook(void *priv, * therefore we need to add 1 to make the SYN sequence * number match the one of first SYN. */ - if (synproxy_recv_client_ack(snet, skb, th, &opts, + if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq) + 1)) this_cpu_inc(snet->stats->cookie_retrans); @@ -391,12 +393,12 @@ static unsigned int ipv4_synproxy_hook(void *priv, XT_SYNPROXY_OPT_SACK_PERM); swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(snet, state, skb, th, &opts); + synproxy_send_server_ack(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(snet, skb, th, &opts); + synproxy_send_client_ack(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; -- cgit v1.2.3 From 5e263f712691615fb802f06c98d7638c378f5d11 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Tue, 29 Mar 2016 18:48:08 +0800 Subject: bridge: Allow set bridge ageing time when switchdev disabled When NET_SWITCHDEV=n, switchdev_port_attr_set will return -EOPNOTSUPP, we should ignore this error code and continue to set the ageing time. Fixes: c62987bbd8a1 ("bridge: push bridge setting ageing_time down to switchdev") Signed-off-by: Haishuang Yan Acked-by: Ido Schimmel Signed-off-by: David S. Miller --- net/bridge/br_stp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index e23449094188..9cb7044d0801 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -582,7 +582,7 @@ int br_set_ageing_time(struct net_bridge *br, u32 ageing_time) int err; err = switchdev_port_attr_set(br->dev, &attr); - if (err) + if (err && err != -EOPNOTSUPP) return err; br->ageing_time = t; -- cgit v1.2.3 From 28fd34985bd5edaea6282a0586f91bcddd336f5e Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 29 Mar 2016 10:41:03 -0300 Subject: sctp: really allow using GFP_KERNEL on sctp_packet_transmit Somehow my patch for commit cea8768f333e ("sctp: allow sctp_transmit_packet and others to use gfp") missed two important chunks, which are now added. Fixes: cea8768f333e ("sctp: allow sctp_transmit_packet and others to use gfp") Signed-off-by: Marcelo Ricardo Leitner Acked-By: Neil Horman Signed-off-by: David S. Miller --- net/sctp/output.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/output.c b/net/sctp/output.c index 736c004abfbc..97745351d58c 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -401,7 +401,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) sk = chunk->skb->sk; /* Allocate the new skb. */ - nskb = alloc_skb(packet->size + MAX_HEADER, GFP_ATOMIC); + nskb = alloc_skb(packet->size + MAX_HEADER, gfp); if (!nskb) goto nomem; @@ -523,8 +523,8 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) */ if (auth) sctp_auth_calculate_hmac(asoc, nskb, - (struct sctp_auth_chunk *)auth, - GFP_ATOMIC); + (struct sctp_auth_chunk *)auth, + gfp); /* 2) Calculate the Adler-32 checksum of the whole packet, * including the SCTP common header and all the -- cgit v1.2.3 From c3483384ee511ee2af40b4076366cd82a6a47b86 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 29 Mar 2016 14:55:22 -0700 Subject: gro: Allow tunnel stacking in the case of FOU/GUE This patch should fix the issues seen with a recent fix to prevent tunnel-in-tunnel frames from being generated with GRO. The fix itself is correct for now as long as we do not add any devices that support NETIF_F_GSO_GRE_CSUM. When such a device is added it could have the potential to mess things up due to the fact that the outer transport header points to the outer UDP header and not the GRE header as would be expected. Fixes: fac8e0f579695 ("tunnels: Don't apply GRO to multiple layers of encapsulation.") Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fou.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index a0586b4a197d..5a94aea280d3 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -195,6 +195,14 @@ static struct sk_buff **fou_gro_receive(struct sk_buff **head, u8 proto = NAPI_GRO_CB(skb)->proto; const struct net_offload **offloads; + /* We can clear the encap_mark for FOU as we are essentially doing + * one of two possible things. We are either adding an L4 tunnel + * header to the outer L3 tunnel header, or we are are simply + * treating the GRE tunnel header as though it is a UDP protocol + * specific header such as VXLAN or GENEVE. + */ + NAPI_GRO_CB(skb)->encap_mark = 0; + rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); @@ -352,6 +360,14 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, } } + /* We can clear the encap_mark for GUE as we are essentially doing + * one of two possible things. We are either adding an L4 tunnel + * header to the outer L3 tunnel header, or we are are simply + * treating the GRE tunnel header as though it is a UDP protocol + * specific header such as VXLAN or GENEVE. + */ + NAPI_GRO_CB(skb)->encap_mark = 0; + rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[guehdr->proto_ctype]); -- cgit v1.2.3 From 2d4212261fdf13e29728ddb5ea9d60c342cc92b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Mar 2016 08:43:41 -0700 Subject: ipv6: udp: fix UDP_MIB_IGNOREDMULTI updates IPv6 counters updates use a different macro than IPv4. Fixes: 36cbb2452cbaf ("udp: Increment UDP_MIB_IGNOREDMULTI for arriving unmatched multicasts") Signed-off-by: Eric Dumazet Cc: Rick Jones Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv6/udp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index fd25e447a5fa..8125931106be 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -843,8 +843,8 @@ start_lookup: flush_stack(stack, count, skb, count - 1); } else { if (!inner_flushed) - UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); + UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); consume_skb(skb); } return 0; -- cgit v1.2.3 From c0e760c9c66ebd8a5a1ede81868677f4df993dfb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Mar 2016 00:02:00 +0200 Subject: bpf: make padding in bpf_tunnel_key explicit Make the 2 byte padding in struct bpf_tunnel_key between tunnel_ttl and tunnel_label members explicit. No issue has been observed, and gcc/llvm does padding for the old struct already, where tunnel_label was not yet present, so the current code works, but since it's part of uapi, make sure we don't introduce holes in structs. Therefore, add tunnel_ext that we can use generically in future (f.e. to flag OAM messages for backends, etc). Also add the offset to the compat tests to be sure should some compilers not padd the tail of the old version of bpf_tunnel_key. Fixes: 4018ab1875e0 ("bpf: support flow label for bpf_skb_{set, get}_tunnel_key") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 924f537183fd..23917bb47bf3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -375,6 +375,7 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; + __u16 tunnel_ext; __u32 tunnel_label; }; diff --git a/net/core/filter.c b/net/core/filter.c index b7177d01ecb0..4b81b71171b4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1764,6 +1764,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): goto set_compat; case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have @@ -1849,6 +1850,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. @@ -1861,7 +1863,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EINVAL; } } - if (unlikely(!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label)) + if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || + from->tunnel_ext)) return -EINVAL; skb_dst_drop(skb); -- cgit v1.2.3 From c57c7a95da842807b475b823ed2e5435c42cb3b0 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 31 Mar 2016 18:10:31 +0200 Subject: rtnl: fix msg size calculation in if_nlmsg_size() Size of the attribute IFLA_PHYS_PORT_NAME was missing. Fixes: db24a9044ee1 ("net: add support for phys_port_name") CC: David Ahern Signed-off-by: Nicolas Dichtel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f2066772d0f3..a75f7e94b445 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -909,6 +909,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } -- cgit v1.2.3 From 5a5abb1fa3b05dd6aa821525832644c1e7d2905f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 31 Mar 2016 02:13:18 +0200 Subject: tun, bpf: fix suspicious RCU usage in tun_{attach, detach}_filter Sasha Levin reported a suspicious rcu_dereference_protected() warning found while fuzzing with trinity that is similar to this one: [ 52.765684] net/core/filter.c:2262 suspicious rcu_dereference_protected() usage! [ 52.765688] other info that might help us debug this: [ 52.765695] rcu_scheduler_active = 1, debug_locks = 1 [ 52.765701] 1 lock held by a.out/1525: [ 52.765704] #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x17/0x20 [ 52.765721] stack backtrace: [ 52.765728] CPU: 1 PID: 1525 Comm: a.out Not tainted 4.5.0+ #264 [...] [ 52.765768] Call Trace: [ 52.765775] [] dump_stack+0x85/0xc8 [ 52.765784] [] lockdep_rcu_suspicious+0xd5/0x110 [ 52.765792] [] sk_detach_filter+0x82/0x90 [ 52.765801] [] tun_detach_filter+0x35/0x90 [tun] [ 52.765810] [] __tun_chr_ioctl+0x354/0x1130 [tun] [ 52.765818] [] ? selinux_file_ioctl+0x130/0x210 [ 52.765827] [] tun_chr_ioctl+0x13/0x20 [tun] [ 52.765834] [] do_vfs_ioctl+0x96/0x690 [ 52.765843] [] ? security_file_ioctl+0x43/0x60 [ 52.765850] [] SyS_ioctl+0x79/0x90 [ 52.765858] [] do_syscall_64+0x62/0x140 [ 52.765866] [] entry_SYSCALL64_slow_path+0x25/0x25 Same can be triggered with PROVE_RCU (+ PROVE_RCU_REPEATEDLY) enabled from tun_attach_filter() when user space calls ioctl(tun_fd, TUN{ATTACH, DETACH}FILTER, ...) for adding/removing a BPF filter on tap devices. Since the fix in f91ff5b9ff52 ("net: sk_{detach|attach}_filter() rcu fixes") sk_attach_filter()/sk_detach_filter() now dereferences the filter with rcu_dereference_protected(), checking whether socket lock is held in control path. Since its introduction in 994051625981 ("tun: socket filter support"), tap filters are managed under RTNL lock from __tun_chr_ioctl(). Thus the sock_owned_by_user(sk) doesn't apply in this specific case and therefore triggers the false positive. Extend the BPF API with __sk_attach_filter()/__sk_detach_filter() pair that is used by tap filters and pass in lockdep_rtnl_is_held() for the rcu_dereference_protected() checks instead. Reported-by: Sasha Levin Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- drivers/net/tun.c | 8 +++++--- include/linux/filter.h | 4 ++++ net/core/filter.c | 33 +++++++++++++++++++++------------ 3 files changed, 30 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index afdf950617c3..510e90a6bb26 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -622,7 +622,8 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte /* Re-attach the filter to persist device */ if (!skip_filter && (tun->filter_attached == true)) { - err = sk_attach_filter(&tun->fprog, tfile->socket.sk); + err = __sk_attach_filter(&tun->fprog, tfile->socket.sk, + lockdep_rtnl_is_held()); if (!err) goto out; } @@ -1822,7 +1823,7 @@ static void tun_detach_filter(struct tun_struct *tun, int n) for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); - sk_detach_filter(tfile->socket.sk); + __sk_detach_filter(tfile->socket.sk, lockdep_rtnl_is_held()); } tun->filter_attached = false; @@ -1835,7 +1836,8 @@ static int tun_attach_filter(struct tun_struct *tun) for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); - ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); + ret = __sk_attach_filter(&tun->fprog, tfile->socket.sk, + lockdep_rtnl_is_held()); if (ret) { tun_detach_filter(tun, i); return ret; diff --git a/include/linux/filter.h b/include/linux/filter.h index 43aa1f8855c7..a51a5361695f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -465,10 +465,14 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); +int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk, + bool locked); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); int sk_detach_filter(struct sock *sk); +int __sk_detach_filter(struct sock *sk, bool locked); + int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); diff --git a/net/core/filter.c b/net/core/filter.c index 4b81b71171b4..ca7f832b2980 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1149,7 +1149,8 @@ void bpf_prog_destroy(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_destroy); -static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) +static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk, + bool locked) { struct sk_filter *fp, *old_fp; @@ -1165,10 +1166,8 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return -ENOMEM; } - old_fp = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); + old_fp = rcu_dereference_protected(sk->sk_filter, locked); rcu_assign_pointer(sk->sk_filter, fp); - if (old_fp) sk_filter_uncharge(sk, old_fp); @@ -1247,7 +1246,8 @@ struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) * occurs or there is insufficient memory for the filter a negative * errno code is returned. On success the return is zero. */ -int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk, + bool locked) { struct bpf_prog *prog = __get_filter(fprog, sk); int err; @@ -1255,7 +1255,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - err = __sk_attach_prog(prog, sk); + err = __sk_attach_prog(prog, sk, locked); if (err < 0) { __bpf_prog_release(prog); return err; @@ -1263,7 +1263,12 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) return 0; } -EXPORT_SYMBOL_GPL(sk_attach_filter); +EXPORT_SYMBOL_GPL(__sk_attach_filter); + +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + return __sk_attach_filter(fprog, sk, sock_owned_by_user(sk)); +} int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { @@ -1309,7 +1314,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - err = __sk_attach_prog(prog, sk); + err = __sk_attach_prog(prog, sk, sock_owned_by_user(sk)); if (err < 0) { bpf_prog_put(prog); return err; @@ -2250,7 +2255,7 @@ static int __init register_sk_filter_ops(void) } late_initcall(register_sk_filter_ops); -int sk_detach_filter(struct sock *sk) +int __sk_detach_filter(struct sock *sk, bool locked) { int ret = -ENOENT; struct sk_filter *filter; @@ -2258,8 +2263,7 @@ int sk_detach_filter(struct sock *sk) if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; - filter = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); + filter = rcu_dereference_protected(sk->sk_filter, locked); if (filter) { RCU_INIT_POINTER(sk->sk_filter, NULL); sk_filter_uncharge(sk, filter); @@ -2268,7 +2272,12 @@ int sk_detach_filter(struct sock *sk) return ret; } -EXPORT_SYMBOL_GPL(sk_detach_filter); +EXPORT_SYMBOL_GPL(__sk_detach_filter); + +int sk_detach_filter(struct sock *sk) +{ + return __sk_detach_filter(sk, sock_owned_by_user(sk)); +} int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) -- cgit v1.2.3