From 01f83d69844d307be2aa6fea88b0e8fe5cbdb2f4 Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Wed, 15 Sep 2010 10:27:52 -0700 Subject: tcp: Prevent overzealous packetization by SWS logic. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If peer uses tiny MSS (say, 75 bytes) and similarly tiny advertised window, the SWS logic will packetize to half the MSS unnecessarily. This causes problems with some embedded devices. However for large MSS devices we do want to half-MSS packetize otherwise we never get enough packets into the pipe for things like fast retransmit and recovery to work. Be careful also to handle the case where MSS > window, otherwise we'll never send until the probe timer. Reported-by: ツ Leandro Melo de Sales Signed-off-by: David S. Miller --- include/net/tcp.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index eaa9582779d0..3e4b33e36602 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -475,8 +475,22 @@ extern unsigned int tcp_current_mss(struct sock *sk); /* Bound MSS / TSO packet size with the half of the window */ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) { - if (tp->max_window && pktsize > (tp->max_window >> 1)) - return max(tp->max_window >> 1, 68U - tp->tcp_header_len); + int cutoff; + + /* When peer uses tiny windows, there is no use in packetizing + * to sub-MSS pieces for the sake of SWS or making sure there + * are enough packets in the pipe for fast recovery. + * + * On the other hand, for extremely large MSS devices, handling + * smaller than MSS windows in this way does make sense. + */ + if (tp->max_window >= 512) + cutoff = (tp->max_window >> 1); + else + cutoff = tp->max_window; + + if (cutoff && pktsize > cutoff) + return max_t(int, cutoff, 68U - tp->tcp_header_len); else return pktsize; } -- cgit v1.2.3 From f0f9deae9e7c421fa0c1c627beb8e174325e1ba7 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 17 Sep 2010 16:55:03 -0700 Subject: netpoll: Disable IRQ around RCU dereference in netpoll_rx We cannot use rcu_dereference_bh safely in netpoll_rx as we may be called with IRQs disabled. We could however simply disable IRQs as that too causes BH to be disabled and is safe in either case. Thanks to John Linville for discovering this bug and providing a patch. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netpoll.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 791d5109f34c..50d8009be86c 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,20 +63,20 @@ static inline bool netpoll_rx(struct sk_buff *skb) unsigned long flags; bool ret = false; - rcu_read_lock_bh(); + local_irq_save(flags); npinfo = rcu_dereference_bh(skb->dev->npinfo); if (!npinfo || (list_empty(&npinfo->rx_np) && !npinfo->rx_flags)) goto out; - spin_lock_irqsave(&npinfo->rx_lock, flags); + spin_lock(&npinfo->rx_lock); /* check rx_flags again with the lock held */ if (npinfo->rx_flags && __netpoll_rx(skb)) ret = true; - spin_unlock_irqrestore(&npinfo->rx_lock, flags); + spin_unlock(&npinfo->rx_lock); out: - rcu_read_unlock_bh(); + local_irq_restore(flags); return ret; } -- cgit v1.2.3 From 8444cf712c5f71845cba9dc30d8f530ff0d5ff83 Mon Sep 17 00:00:00 2001 From: Thomas Egerer Date: Mon, 20 Sep 2010 11:11:38 -0700 Subject: xfrm: Allow different selector family in temporary state The family parameter xfrm_state_find is used to find a state matching a certain policy. This value is set to the template's family (encap_family) right before xfrm_state_find is called. The family parameter is however also used to construct a temporary state in xfrm_state_find itself which is wrong for inter-family scenarios because it produces a selector for the wrong family. Since this selector is included in the xfrm_user_acquire structure, user space programs misinterpret IPv6 addresses as IPv4 and vice versa. This patch splits up the original init_tempsel function into a part that initializes the selector respectively the props and id of the temporary state, to allow for differing ip address families whithin the state. Signed-off-by: Thomas Egerer Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/xfrm.h | 4 ++-- net/ipv4/xfrm4_state.c | 33 +++++++++++++++++++-------------- net/ipv6/xfrm6_state.c | 33 +++++++++++++++++++-------------- net/xfrm/xfrm_policy.c | 5 ++--- net/xfrm/xfrm_state.c | 45 +++++++++++++++++++++++++++------------------ 5 files changed, 69 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fc8f36dd0f5c..4f53532d4c2f 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -298,8 +298,8 @@ struct xfrm_state_afinfo { const struct xfrm_type *type_map[IPPROTO_MAX]; struct xfrm_mode *mode_map[XFRM_MODE_MAX]; int (*init_flags)(struct xfrm_state *x); - void (*init_tempsel)(struct xfrm_state *x, struct flowi *fl, - struct xfrm_tmpl *tmpl, + void (*init_tempsel)(struct xfrm_selector *sel, struct flowi *fl); + void (*init_temprop)(struct xfrm_state *x, struct xfrm_tmpl *tmpl, xfrm_address_t *daddr, xfrm_address_t *saddr); int (*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n); int (*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n); diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 1ef1366a0a03..47947624eccc 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -21,21 +21,25 @@ static int xfrm4_init_flags(struct xfrm_state *x) } static void -__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, - struct xfrm_tmpl *tmpl, - xfrm_address_t *daddr, xfrm_address_t *saddr) +__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl) +{ + sel->daddr.a4 = fl->fl4_dst; + sel->saddr.a4 = fl->fl4_src; + sel->dport = xfrm_flowi_dport(fl); + sel->dport_mask = htons(0xffff); + sel->sport = xfrm_flowi_sport(fl); + sel->sport_mask = htons(0xffff); + sel->family = AF_INET; + sel->prefixlen_d = 32; + sel->prefixlen_s = 32; + sel->proto = fl->proto; + sel->ifindex = fl->oif; +} + +static void +xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, + xfrm_address_t *daddr, xfrm_address_t *saddr) { - x->sel.daddr.a4 = fl->fl4_dst; - x->sel.saddr.a4 = fl->fl4_src; - x->sel.dport = xfrm_flowi_dport(fl); - x->sel.dport_mask = htons(0xffff); - x->sel.sport = xfrm_flowi_sport(fl); - x->sel.sport_mask = htons(0xffff); - x->sel.family = AF_INET; - x->sel.prefixlen_d = 32; - x->sel.prefixlen_s = 32; - x->sel.proto = fl->proto; - x->sel.ifindex = fl->oif; x->id = tmpl->id; if (x->id.daddr.a4 == 0) x->id.daddr.a4 = daddr->a4; @@ -70,6 +74,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .owner = THIS_MODULE, .init_flags = xfrm4_init_flags, .init_tempsel = __xfrm4_init_tempsel, + .init_temprop = xfrm4_init_temprop, .output = xfrm4_output, .extract_input = xfrm4_extract_input, .extract_output = xfrm4_extract_output, diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index f417b77fa0e1..a67575d472a3 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -20,23 +20,27 @@ #include static void -__xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl, - struct xfrm_tmpl *tmpl, - xfrm_address_t *daddr, xfrm_address_t *saddr) +__xfrm6_init_tempsel(struct xfrm_selector *sel, struct flowi *fl) { /* Initialize temporary selector matching only * to current session. */ - ipv6_addr_copy((struct in6_addr *)&x->sel.daddr, &fl->fl6_dst); - ipv6_addr_copy((struct in6_addr *)&x->sel.saddr, &fl->fl6_src); - x->sel.dport = xfrm_flowi_dport(fl); - x->sel.dport_mask = htons(0xffff); - x->sel.sport = xfrm_flowi_sport(fl); - x->sel.sport_mask = htons(0xffff); - x->sel.family = AF_INET6; - x->sel.prefixlen_d = 128; - x->sel.prefixlen_s = 128; - x->sel.proto = fl->proto; - x->sel.ifindex = fl->oif; + ipv6_addr_copy((struct in6_addr *)&sel->daddr, &fl->fl6_dst); + ipv6_addr_copy((struct in6_addr *)&sel->saddr, &fl->fl6_src); + sel->dport = xfrm_flowi_dport(fl); + sel->dport_mask = htons(0xffff); + sel->sport = xfrm_flowi_sport(fl); + sel->sport_mask = htons(0xffff); + sel->family = AF_INET6; + sel->prefixlen_d = 128; + sel->prefixlen_s = 128; + sel->proto = fl->proto; + sel->ifindex = fl->oif; +} + +static void +xfrm6_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, + xfrm_address_t *daddr, xfrm_address_t *saddr) +{ x->id = tmpl->id; if (ipv6_addr_any((struct in6_addr*)&x->id.daddr)) memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); @@ -168,6 +172,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = { .eth_proto = htons(ETH_P_IPV6), .owner = THIS_MODULE, .init_tempsel = __xfrm6_init_tempsel, + .init_temprop = xfrm6_init_temprop, .tmpl_sort = __xfrm6_tmpl_sort, .state_sort = __xfrm6_state_sort, .output = xfrm6_output, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 2b3ed7ad4933..cbab6e1a8c9c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1175,9 +1175,8 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, struct flowi *fl, tmpl->mode == XFRM_MODE_BEET) { remote = &tmpl->id.daddr; local = &tmpl->saddr; - family = tmpl->encap_family; - if (xfrm_addr_any(local, family)) { - error = xfrm_get_saddr(net, &tmp, remote, family); + if (xfrm_addr_any(local, tmpl->encap_family)) { + error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family); if (error) goto fail; local = &tmp; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 5208b12fbfb4..eb96ce52f178 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -656,15 +656,23 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) EXPORT_SYMBOL(xfrm_sad_getinfo); static int -xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl, - struct xfrm_tmpl *tmpl, - xfrm_address_t *daddr, xfrm_address_t *saddr, - unsigned short family) +xfrm_init_tempstate(struct xfrm_state *x, struct flowi *fl, + struct xfrm_tmpl *tmpl, + xfrm_address_t *daddr, xfrm_address_t *saddr, + unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); if (!afinfo) return -1; - afinfo->init_tempsel(x, fl, tmpl, daddr, saddr); + afinfo->init_tempsel(&x->sel, fl); + + if (family != tmpl->encap_family) { + xfrm_state_put_afinfo(afinfo); + afinfo = xfrm_state_get_afinfo(tmpl->encap_family); + if (!afinfo) + return -1; + } + afinfo->init_temprop(x, tmpl, daddr, saddr); xfrm_state_put_afinfo(afinfo); return 0; } @@ -790,37 +798,38 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, int error = 0; struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; + unsigned short encap_family = tmpl->encap_family; to_put = NULL; spin_lock_bh(&xfrm_state_lock); - h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, family); + h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { - if (x->props.family == family && + if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && !(x->props.flags & XFRM_STATE_WILDRECV) && - xfrm_state_addr_check(x, daddr, saddr, family) && + xfrm_state_addr_check(x, daddr, saddr, encap_family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) - xfrm_state_look_at(pol, x, fl, family, daddr, saddr, + xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr, &best, &acquire_in_progress, &error); } if (best) goto found; - h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family); + h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) { - if (x->props.family == family && + if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && !(x->props.flags & XFRM_STATE_WILDRECV) && - xfrm_state_addr_check(x, daddr, saddr, family) && + xfrm_state_addr_check(x, daddr, saddr, encap_family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) - xfrm_state_look_at(pol, x, fl, family, daddr, saddr, + xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr, &best, &acquire_in_progress, &error); } @@ -829,7 +838,7 @@ found: if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi, - tmpl->id.proto, family)) != NULL) { + tmpl->id.proto, encap_family)) != NULL) { to_put = x0; error = -EEXIST; goto out; @@ -839,9 +848,9 @@ found: error = -ENOMEM; goto out; } - /* Initialize temporary selector matching only + /* Initialize temporary state matching only * to current session. */ - xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family); + xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family); memcpy(&x->mark, &pol->mark, sizeof(x->mark)); error = security_xfrm_state_alloc_acquire(x, pol->security, fl->secid); @@ -856,10 +865,10 @@ found: x->km.state = XFRM_STATE_ACQ; list_add(&x->km.all, &net->xfrm.state_all); hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); - h = xfrm_src_hash(net, daddr, saddr, family); + h = xfrm_src_hash(net, daddr, saddr, encap_family); hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); if (x->id.spi) { - h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, family); + h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); } x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; -- cgit v1.2.3 From 56b49f4b8f6728b91d10c556c116175051b77b60 Mon Sep 17 00:00:00 2001 From: Ollie Wild Date: Wed, 22 Sep 2010 05:54:54 +0000 Subject: net: Move "struct net" declaration inside the __KERNEL__ macro guard This patch reduces namespace pollution by moving the "struct net" declaration out of the userspace-facing portion of linux/netlink.h. It has no impact on the kernel. (This came up because we have several C++ applications which use "net" as a namespace name.) Signed-off-by: Ollie Wild Signed-off-by: David S. Miller --- include/linux/netlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 59d066936ab9..123566912d73 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -27,8 +27,6 @@ #define MAX_LINKS 32 -struct net; - struct sockaddr_nl { sa_family_t nl_family; /* AF_NETLINK */ unsigned short nl_pad; /* zero */ @@ -151,6 +149,8 @@ struct nlattr { #include #include +struct net; + static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) { return (struct nlmsghdr *)skb->data; -- cgit v1.2.3 From 693019e90ca45d881109d32c0c6d29adf03f6447 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 23 Sep 2010 11:19:54 +0000 Subject: net: reset skb queue mapping when rx'ing over tunnel Reset queue mapping when an skb is reentering the stack via a tunnel. On second pass, the queue mapping from the original device is no longer valid. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 81d1413a8701..02386505033d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -242,6 +242,7 @@ static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev) dev->stats.rx_packets++; dev->stats.rx_bytes += skb->len; skb->rxhash = 0; + skb_set_queue_mapping(skb, 0); skb_dst_drop(skb); nf_reset(skb); } -- cgit v1.2.3 From 2cc6d2bf3d6195fabcf0febc192c01f99519a8f3 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 24 Sep 2010 09:55:52 +0000 Subject: ipv6: add a missing unregister_pernet_subsys call Clean up a missing exit path in the ipv6 module init routines. In addrconf_init we call ipv6_addr_label_init which calls register_pernet_subsys for the ipv6_addr_label_ops structure. But if module loading fails, or if the ipv6 module is removed, there is no corresponding unregister_pernet_subsys call, which leaves a now-bogus address on the pernet_list, leading to oopses in subsequent registrations. This patch cleans up both the failed load path and the unload path. Tested by myself with good results. Signed-off-by: Neil Horman include/net/addrconf.h | 1 + net/ipv6/addrconf.c | 11 ++++++++--- net/ipv6/addrlabel.c | 5 +++++ 3 files changed, 14 insertions(+), 3 deletions(-) Signed-off-by: David S. Miller --- include/net/addrconf.h | 1 + net/ipv6/addrconf.c | 11 ++++++++--- net/ipv6/addrlabel.c | 5 +++++ 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 45375b41a2a0..4d40c4d0230b 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -121,6 +121,7 @@ static inline int addrconf_finite_timeout(unsigned long timeout) * IPv6 Address Label subsystem (addrlabel.c) */ extern int ipv6_addr_label_init(void); +extern void ipv6_addr_label_cleanup(void); extern void ipv6_addr_label_rtnl_register(void); extern u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ab70a3fbcafa..324fac3b6c16 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4637,10 +4637,12 @@ int __init addrconf_init(void) if (err < 0) { printk(KERN_CRIT "IPv6 Addrconf:" " cannot initialize default policy table: %d.\n", err); - return err; + goto out; } - register_pernet_subsys(&addrconf_ops); + err = register_pernet_subsys(&addrconf_ops); + if (err < 0) + goto out_addrlabel; /* The addrconf netdev notifier requires that loopback_dev * has it's ipv6 private information allocated and setup @@ -4692,7 +4694,9 @@ errout: unregister_netdevice_notifier(&ipv6_dev_notf); errlo: unregister_pernet_subsys(&addrconf_ops); - +out_addrlabel: + ipv6_addr_label_cleanup(); +out: return err; } @@ -4703,6 +4707,7 @@ void addrconf_cleanup(void) unregister_netdevice_notifier(&ipv6_dev_notf); unregister_pernet_subsys(&addrconf_ops); + ipv6_addr_label_cleanup(); rtnl_lock(); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index f0e774cea386..8175f802651b 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -393,6 +393,11 @@ int __init ipv6_addr_label_init(void) return register_pernet_subsys(&ipv6_addr_label_ops); } +void ipv6_addr_label_cleanup(void) +{ + unregister_pernet_subsys(&ipv6_addr_label_ops); +} + static const struct nla_policy ifal_policy[IFAL_MAX+1] = { [IFAL_ADDRESS] = { .len = sizeof(struct in6_addr), }, [IFAL_LABEL] = { .len = sizeof(u32), }, -- cgit v1.2.3