From 568af6de058cb2b0c5b98d98ffcf37cdc6bc38a7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 4 Mar 2017 19:53:47 +0100 Subject: netfilter: nf_tables: set pktinfo->thoff at AH header if found Phil Sutter reports that IPv6 AH header matching is broken. From userspace, nft generates bytecode that expects to find the AH header at NFT_PAYLOAD_TRANSPORT_HEADER both for IPv4 and IPv6. However, pktinfo->thoff is set to the inner header after the AH header in IPv6, while in IPv4 pktinfo->thoff points to the AH header indeed. This behaviour is inconsistent. This patch fixes this problem by updating ipv6_find_hdr() to get the IP6_FH_F_AUTH flag so this function stops at the AH header, so both IPv4 and IPv6 pktinfo->thoff point to the AH header. This is also inconsistent when trying to match encapsulated headers: 1) A packet that looks like IPv4 + AH + TCP dport 22 will *not* match. 2) A packet that looks like IPv6 + AH + TCP dport 22 will match. Reported-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_ipv6.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index d150b5066201..97983d1c05e4 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -9,12 +9,13 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, struct sk_buff *skb, const struct nf_hook_state *state) { + unsigned int flags = IP6_FH_F_AUTH; int protohdr, thoff = 0; unsigned short frag_off; nft_set_pktinfo(pkt, skb, state); - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0) { nft_set_pktinfo_proto_unspec(pkt, skb); return; @@ -32,6 +33,7 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_IPV6) + unsigned int flags = IP6_FH_F_AUTH; struct ipv6hdr *ip6h, _ip6h; unsigned int thoff = 0; unsigned short frag_off; @@ -50,7 +52,7 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, if (pkt_len + sizeof(*ip6h) > skb->len) return -1; - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0) return -1; -- cgit v1.2.3 From 10596608c4d62cb8c1c2b806debcbd32fe657e71 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 8 Mar 2017 22:54:18 +0800 Subject: netfilter: nf_tables: fix mismatch in big-endian system Currently, there are two different methods to store an u16 integer to the u32 data register. For example: u32 *dest = ®s->data[priv->dreg]; 1. *dest = 0; *(u16 *) dest = val_u16; 2. *dest = val_u16; For method 1, the u16 value will be stored like this, either in big-endian or little-endian system: 0 15 31 +-+-+-+-+-+-+-+-+-+-+-+-+ | Value | 0 | +-+-+-+-+-+-+-+-+-+-+-+-+ For method 2, in little-endian system, the u16 value will be the same as listed above. But in big-endian system, the u16 value will be stored like this: 0 15 31 +-+-+-+-+-+-+-+-+-+-+-+-+ | 0 | Value | +-+-+-+-+-+-+-+-+-+-+-+-+ So later we use "memcmp(®s->data[priv->sreg], data, 2);" to do compare in nft_cmp, nft_lookup expr ..., method 2 will get the wrong result in big-endian system, as 0~15 bits will always be zero. For the similar reason, when loading an u16 value from the u32 data register, we should use "*(u16 *) sreg;" instead of "(u16)*sreg;", the 2nd method will get the wrong value in the big-endian system. So introduce some wrapper functions to store/load an u8 or u16 integer to/from the u32 data register, and use them in the right place. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 29 +++++++++++++++++++++++++++ net/ipv4/netfilter/nft_masq_ipv4.c | 8 ++++---- net/ipv4/netfilter/nft_redir_ipv4.c | 8 ++++---- net/ipv6/netfilter/nft_masq_ipv6.c | 8 ++++---- net/ipv6/netfilter/nft_redir_ipv6.c | 8 ++++---- net/netfilter/nft_ct.c | 18 +++++++++-------- net/netfilter/nft_meta.c | 40 +++++++++++++++++++------------------ net/netfilter/nft_nat.c | 8 ++++---- 8 files changed, 80 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2aa8a9d80fbe..70c5ca0c60b1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -103,6 +103,35 @@ struct nft_regs { }; }; +/* Store/load an u16 or u8 integer to/from the u32 data register. + * + * Note, when using concatenations, register allocation happens at 32-bit + * level. So for store instruction, pad the rest part with zero to avoid + * garbage values. + */ + +static inline void nft_reg_store16(u32 *dreg, u16 val) +{ + *dreg = 0; + *(u16 *)dreg = val; +} + +static inline void nft_reg_store8(u32 *dreg, u8 val) +{ + *dreg = 0; + *(u8 *)dreg = val; +} + +static inline u16 nft_reg_load16(u32 *sreg) +{ + return *(u16 *)sreg; +} + +static inline u8 nft_reg_load8(u32 *sreg) +{ + return *(u8 *)sreg; +} + static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index a0ea8aad1bf1..f18677277119 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -26,10 +26,10 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; if (priv->sreg_proto_min) { - range.min_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_min]; - range.max_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_max]; + range.min_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_max]); } regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt), &range, nft_out(pkt)); diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index 1650ed23c15d..5120be1d3118 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -26,10 +26,10 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr, memset(&mr, 0, sizeof(mr)); if (priv->sreg_proto_min) { - mr.range[0].min.all = - *(__be16 *)®s->data[priv->sreg_proto_min]; - mr.range[0].max.all = - *(__be16 *)®s->data[priv->sreg_proto_max]; + mr.range[0].min.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_min]); + mr.range[0].max.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_max]); mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index 6c5b5b1830a7..4146536e9c15 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -27,10 +27,10 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; if (priv->sreg_proto_min) { - range.min_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_min]; - range.max_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_max]; + range.min_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_max]); } regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, nft_out(pkt)); diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c index f5ac080fc084..a27e424f690d 100644 --- a/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -26,10 +26,10 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); if (priv->sreg_proto_min) { - range.min_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_min], - range.max_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_max], + range.min_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_max]); range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index bf548a7a71ec..91585b5e5307 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -83,7 +83,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, switch (priv->key) { case NFT_CT_DIRECTION: - *dest = CTINFO2DIR(ctinfo); + nft_reg_store8(dest, CTINFO2DIR(ctinfo)); return; case NFT_CT_STATUS: *dest = ct->status; @@ -151,20 +151,22 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; } case NFT_CT_L3PROTOCOL: - *dest = nf_ct_l3num(ct); + nft_reg_store8(dest, nf_ct_l3num(ct)); return; case NFT_CT_PROTOCOL: - *dest = nf_ct_protonum(ct); + nft_reg_store8(dest, nf_ct_protonum(ct)); return; #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: { const struct nf_conntrack_zone *zone = nf_ct_zone(ct); + u16 zoneid; if (priv->dir < IP_CT_DIR_MAX) - *dest = nf_ct_zone_id(zone, priv->dir); + zoneid = nf_ct_zone_id(zone, priv->dir); else - *dest = zone->id; + zoneid = zone->id; + nft_reg_store16(dest, zoneid); return; } #endif @@ -183,10 +185,10 @@ static void nft_ct_get_eval(const struct nft_expr *expr, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; case NFT_CT_PROTO_SRC: - *dest = (__force __u16)tuple->src.u.all; + nft_reg_store16(dest, (__force u16)tuple->src.u.all); return; case NFT_CT_PROTO_DST: - *dest = (__force __u16)tuple->dst.u.all; + nft_reg_store16(dest, (__force u16)tuple->dst.u.all); return; default: break; @@ -205,7 +207,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; enum ip_conntrack_info ctinfo; - u16 value = regs->data[priv->sreg]; + u16 value = nft_reg_load16(®s->data[priv->sreg]); struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index e1f5ca9b423b..7b60e01f38ff 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -45,16 +45,15 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = skb->len; break; case NFT_META_PROTOCOL: - *dest = 0; - *(__be16 *)dest = skb->protocol; + nft_reg_store16(dest, (__force u16)skb->protocol); break; case NFT_META_NFPROTO: - *dest = nft_pf(pkt); + nft_reg_store8(dest, nft_pf(pkt)); break; case NFT_META_L4PROTO: if (!pkt->tprot_set) goto err; - *dest = pkt->tprot; + nft_reg_store8(dest, pkt->tprot); break; case NFT_META_PRIORITY: *dest = skb->priority; @@ -85,14 +84,12 @@ void nft_meta_get_eval(const struct nft_expr *expr, case NFT_META_IIFTYPE: if (in == NULL) goto err; - *dest = 0; - *(u16 *)dest = in->type; + nft_reg_store16(dest, in->type); break; case NFT_META_OIFTYPE: if (out == NULL) goto err; - *dest = 0; - *(u16 *)dest = out->type; + nft_reg_store16(dest, out->type); break; case NFT_META_SKUID: sk = skb_to_full_sk(skb); @@ -142,19 +139,19 @@ void nft_meta_get_eval(const struct nft_expr *expr, #endif case NFT_META_PKTTYPE: if (skb->pkt_type != PACKET_LOOPBACK) { - *dest = skb->pkt_type; + nft_reg_store8(dest, skb->pkt_type); break; } switch (nft_pf(pkt)) { case NFPROTO_IPV4: if (ipv4_is_multicast(ip_hdr(skb)->daddr)) - *dest = PACKET_MULTICAST; + nft_reg_store8(dest, PACKET_MULTICAST); else - *dest = PACKET_BROADCAST; + nft_reg_store8(dest, PACKET_BROADCAST); break; case NFPROTO_IPV6: - *dest = PACKET_MULTICAST; + nft_reg_store8(dest, PACKET_MULTICAST); break; case NFPROTO_NETDEV: switch (skb->protocol) { @@ -168,14 +165,14 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; if (ipv4_is_multicast(iph->daddr)) - *dest = PACKET_MULTICAST; + nft_reg_store8(dest, PACKET_MULTICAST); else - *dest = PACKET_BROADCAST; + nft_reg_store8(dest, PACKET_BROADCAST); break; } case htons(ETH_P_IPV6): - *dest = PACKET_MULTICAST; + nft_reg_store8(dest, PACKET_MULTICAST); break; default: WARN_ON_ONCE(1); @@ -230,7 +227,9 @@ void nft_meta_set_eval(const struct nft_expr *expr, { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; - u32 value = regs->data[meta->sreg]; + u32 *sreg = ®s->data[meta->sreg]; + u32 value = *sreg; + u8 pkt_type; switch (meta->key) { case NFT_META_MARK: @@ -240,9 +239,12 @@ void nft_meta_set_eval(const struct nft_expr *expr, skb->priority = value; break; case NFT_META_PKTTYPE: - if (skb->pkt_type != value && - skb_pkt_type_ok(value) && skb_pkt_type_ok(skb->pkt_type)) - skb->pkt_type = value; + pkt_type = nft_reg_load8(sreg); + + if (skb->pkt_type != pkt_type && + skb_pkt_type_ok(pkt_type) && + skb_pkt_type_ok(skb->pkt_type)) + skb->pkt_type = pkt_type; break; case NFT_META_NFTRACE: skb->nf_trace = !!value; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 19a7bf3236f9..439e0bd152a0 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -65,10 +65,10 @@ static void nft_nat_eval(const struct nft_expr *expr, } if (priv->sreg_proto_min) { - range.min_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_min]; - range.max_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_max]; + range.min_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_max]); range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } -- cgit v1.2.3 From 170a1fb9c01bc40b7e8fd57a32ac9a0e131ec5b6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sat, 11 Mar 2017 00:25:26 -0500 Subject: netfilter: Force fake conntrack entry to be at least 8 bytes aligned Since the nfct and nfctinfo have been combined, the nf_conn structure must be at least 8 bytes aligned, as the 3 LSB bits are used for the nfctinfo. But there's a fake nf_conn structure to denote untracked connections, which is created by a PER_CPU construct. This does not guarantee that it will be 8 bytes aligned and can break the logic in determining the correct nfctinfo. I triggered this on a 32bit machine with the following error: BUG: unable to handle kernel NULL pointer dereference at 00000af4 IP: nf_ct_deliver_cached_events+0x1b/0xfb *pdpt = 0000000031962001 *pde = 0000000000000000 Oops: 0000 [#1] SMP [Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables ipv6 crc_ccitt ppdev r8169 parport_pc parport OK ] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.10.0-test+ #75 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 task: c126ec00 task.stack: c1258000 EIP: nf_ct_deliver_cached_events+0x1b/0xfb EFLAGS: 00010202 CPU: 0 EAX: 0021cd01 EBX: 00000000 ECX: 27b0c767 EDX: 32bcb17a ESI: f34135c0 EDI: f34135c0 EBP: f2debd60 ESP: f2debd3c DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 80050033 CR2: 00000af4 CR3: 309a0440 CR4: 001406f0 Call Trace: ? ipv6_skip_exthdr+0xac/0xcb ipv6_confirm+0x10c/0x119 [nf_conntrack_ipv6] nf_hook_slow+0x22/0xc7 nf_hook+0x9a/0xad [ipv6] ? ip6t_do_table+0x356/0x379 [ip6_tables] ? ip6_fragment+0x9e9/0x9e9 [ipv6] ip6_output+0xee/0x107 [ipv6] ? ip6_fragment+0x9e9/0x9e9 [ipv6] dst_output+0x36/0x4d [ipv6] NF_HOOK.constprop.37+0xb2/0xba [ipv6] ? icmp6_dst_alloc+0x2c/0xfd [ipv6] ? local_bh_enable+0x14/0x14 [ipv6] mld_sendpack+0x1c5/0x281 [ipv6] ? mark_held_locks+0x40/0x5c mld_ifc_timer_expire+0x1f6/0x21e [ipv6] call_timer_fn+0x135/0x283 ? detach_if_pending+0x55/0x55 ? mld_dad_timer_expire+0x3e/0x3e [ipv6] __run_timers+0x111/0x14b ? mld_dad_timer_expire+0x3e/0x3e [ipv6] run_timer_softirq+0x1c/0x36 __do_softirq+0x185/0x37c ? test_ti_thread_flag.constprop.19+0xd/0xd do_softirq_own_stack+0x22/0x28 irq_exit+0x5a/0xa4 smp_apic_timer_interrupt+0x2a/0x34 apic_timer_interrupt+0x37/0x3c By using DEFINE/DECLARE_PER_CPU_ALIGNED we can enforce at least 8 byte alignment as all cache line sizes are at least 8 bytes or more. Fixes: a9e419dc7be6 ("netfilter: merge ctinfo into nfct pointer storage area") Signed-off-by: Steven Rostedt (VMware) Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 +- net/netfilter/nf_conntrack_core.c | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index f540f9ad2af4..19605878da47 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -244,7 +244,7 @@ extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, u32 seq); /* Fake conntrack entry for untracked connections */ -DECLARE_PER_CPU(struct nf_conn, nf_conntrack_untracked); +DECLARE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked); static inline struct nf_conn *nf_ct_untracked_get(void) { return raw_cpu_ptr(&nf_conntrack_untracked); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 071b97fcbefb..ffb78e5f7b70 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -181,7 +181,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; seqcount_t nf_conntrack_generation __read_mostly; -DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); +/* nf_conn must be 8 bytes aligned, as the 3 LSB bits are used + * for the nfctinfo. We cheat by (ab)using the PER CPU cache line + * alignment to enforce this. + */ +DEFINE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked); EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); static unsigned int nf_conntrack_hash_rnd __read_mostly; -- cgit v1.2.3 From 04166f48d9593af4513ae06c0f966c0cee300a20 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Mar 2017 13:24:03 +0100 Subject: Revert "netfilter: nf_tables: add flush field to struct nft_set_iter" This reverts commit 1f48ff6c5393aa7fe290faf5d633164f105b0aa7. This patch is not required anymore now that we keep a dummy list of set elements in the bitmap set implementation, so revert this before we forget this code has no clients. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 - net/netfilter/nf_tables_api.c | 4 ---- 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 70c5ca0c60b1..0136028652bd 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -232,7 +232,6 @@ struct nft_set_elem { struct nft_set; struct nft_set_iter { u8 genmask; - bool flush; unsigned int count; unsigned int skip; int err; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5e0ccfd5bb37..434c739dfeca 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3145,7 +3145,6 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, iter.count = 0; iter.err = 0; iter.fn = nf_tables_bind_check_setelem; - iter.flush = false; set->ops->walk(ctx, set, &iter); if (iter.err < 0) @@ -3399,7 +3398,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.iter.count = 0; args.iter.err = 0; args.iter.fn = nf_tables_dump_setelem; - args.iter.flush = false; set->ops->walk(&ctx, set, &args.iter); nla_nest_end(skb, nest); @@ -3963,7 +3961,6 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, struct nft_set_iter iter = { .genmask = genmask, .fn = nft_flush_set, - .flush = true, }; set->ops->walk(&ctx, set, &iter); @@ -5114,7 +5111,6 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, iter.count = 0; iter.err = 0; iter.fn = nf_tables_loop_check_setelem; - iter.flush = false; set->ops->walk(ctx, set, &iter); if (iter.err < 0) -- cgit v1.2.3 From 4cbe4dac82e423ecc9a0ba46af24a860853259f4 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 13 Mar 2017 19:29:08 +0200 Subject: net/mlx4_core: Avoid delays during VF driver device shutdown Some Hypervisors detach VFs from VMs by instantly causing an FLR event to be generated for a VF. In the mlx4 case, this will cause that VF's comm channel to be disabled before the VM has an opportunity to invoke the VF device's "shutdown" method. For such Hypervisors, there is a race condition between the VF's shutdown method and its internal-error detection/reset thread. The internal-error detection/reset thread (which runs every 5 seconds) also detects a disabled comm channel. If the internal-error detection/reset flow wins the race, we still get delays (while that flow tries repeatedly to detect comm-channel recovery). The cited commit fixed the command timeout problem when the internal-error detection/reset flow loses the race. This commit avoids the unneeded delays when the internal-error detection/reset flow wins. Fixes: d585df1c5ccf ("net/mlx4_core: Avoid command timeouts during VF driver device shutdown") Signed-off-by: Jack Morgenstein Reported-by: Simon Xiao Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 11 +++++++++++ drivers/net/ethernet/mellanox/mlx4/main.c | 11 +++++++++++ include/linux/mlx4/device.h | 1 + 3 files changed, 23 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index e8c105164931..0e0fa7030565 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -2305,6 +2305,17 @@ static int sync_toggles(struct mlx4_dev *dev) rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)); if (wr_toggle == 0xffffffff || rd_toggle == 0xffffffff) { /* PCI might be offline */ + + /* If device removal has been requested, + * do not continue retrying. + */ + if (dev->persist->interface_state & + MLX4_INTERFACE_STATE_NOWAIT) { + mlx4_warn(dev, + "communication channel is offline\n"); + return -EIO; + } + msleep(100); wr_toggle = swab32(readl(&priv->mfunc.comm-> slave_write)); diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 21377c315083..703205475524 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -1940,6 +1940,14 @@ static int mlx4_comm_check_offline(struct mlx4_dev *dev) (u32)(1 << COMM_CHAN_OFFLINE_OFFSET)); if (!offline_bit) return 0; + + /* If device removal has been requested, + * do not continue retrying. + */ + if (dev->persist->interface_state & + MLX4_INTERFACE_STATE_NOWAIT) + break; + /* There are cases as part of AER/Reset flow that PF needs * around 100 msec to load. We therefore sleep for 100 msec * to allow other tasks to make use of that CPU during this @@ -3955,6 +3963,9 @@ static void mlx4_remove_one(struct pci_dev *pdev) struct devlink *devlink = priv_to_devlink(priv); int active_vfs = 0; + if (mlx4_is_slave(dev)) + persist->interface_state |= MLX4_INTERFACE_STATE_NOWAIT; + mutex_lock(&persist->interface_state_mutex); persist->interface_state |= MLX4_INTERFACE_STATE_DELETION; mutex_unlock(&persist->interface_state_mutex); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7e66e4f62858..1beb1ec2fbdf 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -476,6 +476,7 @@ enum { enum { MLX4_INTERFACE_STATE_UP = 1 << 0, MLX4_INTERFACE_STATE_DELETION = 1 << 1, + MLX4_INTERFACE_STATE_NOWAIT = 1 << 2, }; #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ -- cgit v1.2.3 From 36d277bac8080202684e67162ebb157f16631581 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 15 Mar 2017 09:32:14 +0800 Subject: vsock: track pkt owner vsock So that we can cancel a queued pkt later if necessary. Signed-off-by: Peng Tao Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 3 +++ net/vmw_vsock/virtio_transport_common.c | 7 +++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 9638bfeb0d1f..584f9a647ad4 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -48,6 +48,8 @@ struct virtio_vsock_pkt { struct virtio_vsock_hdr hdr; struct work_struct work; struct list_head list; + /* socket refcnt not held, only use for cancellation */ + struct vsock_sock *vsk; void *buf; u32 len; u32 off; @@ -56,6 +58,7 @@ struct virtio_vsock_pkt { struct virtio_vsock_pkt_info { u32 remote_cid, remote_port; + struct vsock_sock *vsk; struct msghdr *msg; u32 pkt_len; u16 type; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 8d592a45b597..af087b44ceea 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -58,6 +58,7 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, pkt->len = len; pkt->hdr.len = cpu_to_le32(len); pkt->reply = info->reply; + pkt->vsk = info->vsk; if (info->msg && len > 0) { pkt->buf = kmalloc(len, GFP_KERNEL); @@ -180,6 +181,7 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk, struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, .type = type, + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); @@ -519,6 +521,7 @@ int virtio_transport_connect(struct vsock_sock *vsk) struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_REQUEST, .type = VIRTIO_VSOCK_TYPE_STREAM, + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); @@ -534,6 +537,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | (mode & SEND_SHUTDOWN ? VIRTIO_VSOCK_SHUTDOWN_SEND : 0), + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); @@ -560,6 +564,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk, .type = VIRTIO_VSOCK_TYPE_STREAM, .msg = msg, .pkt_len = len, + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); @@ -581,6 +586,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk, .op = VIRTIO_VSOCK_OP_RST, .type = VIRTIO_VSOCK_TYPE_STREAM, .reply = !!pkt, + .vsk = vsk, }; /* Send RST only if the original pkt is not a RST pkt */ @@ -826,6 +832,7 @@ virtio_transport_send_response(struct vsock_sock *vsk, .remote_cid = le64_to_cpu(pkt->hdr.src_cid), .remote_port = le32_to_cpu(pkt->hdr.src_port), .reply = true, + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); -- cgit v1.2.3 From 16320f363ae128d9b9c70e60f00f2a572f57c23d Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 15 Mar 2017 09:32:15 +0800 Subject: vhost-vsock: add pkt cancel capability To allow canceling all packets of a connection. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Peng Tao Signed-off-by: David S. Miller --- drivers/vhost/vsock.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/net/af_vsock.h | 3 +++ 2 files changed, 44 insertions(+) (limited to 'include') diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index ce5e63d2c66a..44eed8eb0725 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -223,6 +223,46 @@ vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt) return len; } +static int +vhost_transport_cancel_pkt(struct vsock_sock *vsk) +{ + struct vhost_vsock *vsock; + struct virtio_vsock_pkt *pkt, *n; + int cnt = 0; + LIST_HEAD(freeme); + + /* Find the vhost_vsock according to guest context id */ + vsock = vhost_vsock_get(vsk->remote_addr.svm_cid); + if (!vsock) + return -ENODEV; + + spin_lock_bh(&vsock->send_pkt_list_lock); + list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) { + if (pkt->vsk != vsk) + continue; + list_move(&pkt->list, &freeme); + } + spin_unlock_bh(&vsock->send_pkt_list_lock); + + list_for_each_entry_safe(pkt, n, &freeme, list) { + if (pkt->reply) + cnt++; + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + if (cnt) { + struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; + int new_cnt; + + new_cnt = atomic_sub_return(cnt, &vsock->queued_replies); + if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num) + vhost_poll_queue(&tx_vq->poll); + } + + return 0; +} + static struct virtio_vsock_pkt * vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq, unsigned int out, unsigned int in) @@ -675,6 +715,7 @@ static struct virtio_transport vhost_transport = { .release = virtio_transport_release, .connect = virtio_transport_connect, .shutdown = virtio_transport_shutdown, + .cancel_pkt = vhost_transport_cancel_pkt, .dgram_enqueue = virtio_transport_dgram_enqueue, .dgram_dequeue = virtio_transport_dgram_dequeue, diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index f2758964ce6f..f32ed9ac181a 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -100,6 +100,9 @@ struct vsock_transport { void (*destruct)(struct vsock_sock *); void (*release)(struct vsock_sock *); + /* Cancel all pending packets sent on vsock. */ + int (*cancel_pkt)(struct vsock_sock *vsk); + /* Connections. */ int (*connect)(struct vsock_sock *); -- cgit v1.2.3 From 1f904495b79003cd3d881de8731377d48fcbc7e3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 18 Mar 2017 19:27:23 +0800 Subject: sctp: define dst_pending_confirm as a bit in sctp_transport As tp->dst_pending_confirm's value can only be set 0 or 1, this patch is to change to define it as a bit instead of __u32. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 07a0b128625a..4f645198e9bd 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -753,6 +753,8 @@ struct sctp_transport { /* Is the Path MTU update pending on this tranport */ pmtu_pending:1, + dst_pending_confirm:1, /* need to confirm neighbour */ + /* Has this transport moved the ctsn since we last sacked */ sack_generation:1; u32 dst_cookie; @@ -806,8 +808,6 @@ struct sctp_transport { __u32 burst_limited; /* Holds old cwnd when max.burst is applied */ - __u32 dst_pending_confirm; /* need to confirm neighbour */ - /* Destination */ struct dst_entry *dst; /* Source address. */ -- cgit v1.2.3 From 4ef1b2869447411ad3ef91ad7d4891a83c1a509a Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sat, 18 Mar 2017 17:03:00 -0400 Subject: tcp: mark skbs with SCM_TIMESTAMPING_OPT_STATS SOF_TIMESTAMPING_OPT_STATS can be enabled and disabled while packets are collected on the error queue. So, checking SOF_TIMESTAMPING_OPT_STATS in sk->sk_tsflags is not enough to safely assume that the skb contains OPT_STATS data. Add a bit in sock_exterr_skb to indicate whether the skb contains opt_stats data. Fixes: 1c885808e456 ("tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING") Reported-by: JongHwan Kim Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/errqueue.h | 2 ++ net/core/skbuff.c | 17 +++++++++++------ net/socket.c | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/errqueue.h b/include/linux/errqueue.h index 9ca23fcfb5d7..6fdfc884fdeb 100644 --- a/include/linux/errqueue.h +++ b/include/linux/errqueue.h @@ -20,6 +20,8 @@ struct sock_exterr_skb { struct sock_extended_err ee; u16 addr_offset; __be16 port; + u8 opt_stats:1, + unused:7; }; #endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b1fbd1958eb6..9f781092fda9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3793,16 +3793,20 @@ EXPORT_SYMBOL(skb_clone_sk); static void __skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk, - int tstype) + int tstype, + bool opt_stats) { struct sock_exterr_skb *serr; int err; + BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); + serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = tstype; + serr->opt_stats = opt_stats; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk->sk_protocol == IPPROTO_TCP && @@ -3843,7 +3847,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, */ if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) { *skb_hwtstamps(skb) = *hwtstamps; - __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); sock_put(sk); } } @@ -3854,7 +3858,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, struct sock *sk, int tstype) { struct sk_buff *skb; - bool tsonly; + bool tsonly, opt_stats = false; if (!sk) return; @@ -3867,9 +3871,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, #ifdef CONFIG_INET if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) + sk->sk_type == SOCK_STREAM) { skb = tcp_get_timestamping_opt_stats(sk); - else + opt_stats = true; + } else #endif skb = alloc_skb(0, GFP_ATOMIC); } else { @@ -3888,7 +3893,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, else skb->tstamp = ktime_get_real(); - __skb_complete_tx_timestamp(skb, sk, tstype); + __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); } EXPORT_SYMBOL_GPL(__skb_tstamp_tx); diff --git a/net/socket.c b/net/socket.c index 692d6989d2c2..985ef06792d6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -706,7 +706,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, SCM_TIMESTAMPING, sizeof(tss), &tss); if (skb_is_err_queue(skb) && skb->len && - (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS)) + SKB_EXT_ERR(skb)->opt_stats) put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS, skb->len, skb->data); } -- cgit v1.2.3 From 1511949c61ec63e4b646c34d602ac6990b38ce30 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 20 Mar 2017 17:46:27 +0800 Subject: sctp: declare struct sctp_stream before using it sctp_stream_free uses struct sctp_stream as a param, but struct sctp_stream is defined after it's declaration. This patch is to declare struct sctp_stream before sctp_stream_free. Fixes: a83863174a61 ("sctp: prepare asoc stream for stream reconf") Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 4f645198e9bd..592decebac75 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -83,6 +83,7 @@ struct sctp_bind_addr; struct sctp_ulpq; struct sctp_ep_common; struct crypto_shash; +struct sctp_stream; #include -- cgit v1.2.3