From 8e8ce08198de193e3d21d42e96945216e3d9ac7f Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 16 Feb 2020 13:02:06 +0100 Subject: batman-adv: Don't schedule OGM for disabled interface A transmission scheduling for an interface which is currently dropped by batadv_iv_ogm_iface_disable could still be in progress. The B.A.T.M.A.N. V is simply cancelling the workqueue item in an synchronous way but this is not possible with B.A.T.M.A.N. IV because the OGM submissions are intertwined. Instead it has to stop submitting the OGM when it detect that the buffer pointer is set to NULL. Reported-by: syzbot+a98f2016f40b9cd3818a@syzkaller.appspotmail.com Reported-by: syzbot+ac36b6a33c28a491e929@syzkaller.appspotmail.com Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Signed-off-by: Sven Eckelmann Cc: Hillf Danton Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f0209505e41a..a7c8dd7ae513 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -789,6 +789,10 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex); + /* interface already disabled by batadv_iv_ogm_iface_disable */ + if (!*ogm_buff) + return; + /* the interface gets activated here to avoid race conditions between * the moment of activating the interface in * hardif_activate_interface() where the originator mac is set and -- cgit v1.2.3 From 9951ebfcdf2b97dbb28a5d930458424341e61aa2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 21 Feb 2020 10:41:43 +0100 Subject: nl80211: fix potential leak in AP start If nl80211_parse_he_obss_pd() fails, we leak the previously allocated ACL memory. Free it in this case. Fixes: 796e90f42b7e ("cfg80211: add support for parsing OBBS_PD attributes") Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200221104142.835aba4cdd14.I1923b55ba9989c57e13978f91f40bfdc45e60cbd@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index cedf17d4933f..46be40e19e7f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4800,8 +4800,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_he_obss_pd( info->attrs[NL80211_ATTR_HE_OBSS_PD], ¶ms.he_obss_pd); - if (err) - return err; + goto out; } nl80211_calculate_ap_params(¶ms); @@ -4823,6 +4822,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) } wdev_unlock(wdev); +out: kfree(params.acl); return err; -- cgit v1.2.3 From a7ee7d44b57c9ae174088e53a668852b7f4f452d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 21 Feb 2020 10:44:50 +0100 Subject: cfg80211: check reg_rule for NULL in handle_channel_custom() We may end up with a NULL reg_rule after the loop in handle_channel_custom() if the bandwidth didn't fit, check if this is the case and bail out if so. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200221104449.3b558a50201c.I4ad3725c4dacaefd2d18d3cc65ba6d18acd5dbfe@changeid Signed-off-by: Johannes Berg --- net/wireless/reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index fff9a74891fc..1a8218f1bbe0 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2276,7 +2276,7 @@ static void handle_channel_custom(struct wiphy *wiphy, break; } - if (IS_ERR(reg_rule)) { + if (IS_ERR_OR_NULL(reg_rule)) { pr_debug("Disabling freq %d MHz as custom regd has no rule that fits it\n", chan->center_freq); if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { -- cgit v1.2.3 From 0daa63ed4c6c4302790ce67b7a90c0997ceb7514 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Fri, 21 Feb 2020 10:47:20 +0100 Subject: mac80211: Remove a redundant mutex unlock The below-mentioned commit changed the code to unlock *inside* the function, but previously the unlock was *outside*. It failed to remove the outer unlock, however, leading to double unlock. Fix this. Fixes: 33483a6b88e4 ("mac80211: fix missing unlock on error in ieee80211_mark_sta_auth()") Signed-off-by: Andrei Otcheretianski Link: https://lore.kernel.org/r/20200221104719.cce4741cf6eb.I671567b185c8a4c2409377e483fd149ce590f56d@changeid [rewrite commit message to better explain what happened] Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index e041af2f021a..88d7a692a965 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2959,7 +2959,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, (auth_transaction == 2 && ifmgd->auth_data->expected_transaction == 2)) { if (!ieee80211_mark_sta_auth(sdata, bssid)) - goto out_err; + return; /* ignore frame -- wait for timeout */ } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && auth_transaction == 2) { sdata_info(sdata, "SAE peer confirmed\n"); @@ -2967,10 +2967,6 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, } cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); - return; - out_err: - mutex_unlock(&sdata->local->sta_mtx); - /* ignore frame -- wait for timeout */ } #define case_WLAN(type) \ -- cgit v1.2.3 From f66ee0410b1c3481ee75e5db9b34547b4d582465 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 11 Feb 2020 23:20:43 +0100 Subject: netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports In the case of huge hash:* types of sets, due to the single spinlock of a set the processing of the whole set under spinlock protection could take too long. There were four places where the whole hash table of the set was processed from bucket to bucket under holding the spinlock: - During resizing a set, the original set was locked to exclude kernel side add/del element operations (userspace add/del is excluded by the nfnetlink mutex). The original set is actually just read during the resize, so the spinlocking is replaced with rcu locking of regions. However, thus there can be parallel kernel side add/del of entries. In order not to loose those operations a backlog is added and replayed after the successful resize. - Garbage collection of timed out entries was also protected by the spinlock. In order not to lock too long, region locking is introduced and a single region is processed in one gc go. Also, the simple timer based gc running is replaced with a workqueue based solution. The internal book-keeping (number of elements, size of extensions) is moved to region level due to the region locking. - Adding elements: when the max number of the elements is reached, the gc was called to evict the timed out entries. The new approach is that the gc is called just for the matching region, assuming that if the region (proportionally) seems to be full, then the whole set does. We could scan the other regions to check every entry under rcu locking, but for huge sets it'd mean a slowdown at adding elements. - Listing the set header data: when the set was defined with timeout support, the garbage collector was called to clean up timed out entries to get the correct element numbers and set size values. Now the set is scanned to check non-timed out entries, without actually calling the gc for the whole set. Thanks to Florian Westphal for helping me to solve the SOFTIRQ-safe -> SOFTIRQ-unsafe lock order issues during working on the patch. Reported-by: syzbot+4b0e9d4ff3cf117837e5@syzkaller.appspotmail.com Reported-by: syzbot+c27b8d5010f45c666ed1@syzkaller.appspotmail.com Reported-by: syzbot+68a806795ac89df3aa1c@syzkaller.appspotmail.com Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7") Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 11 +- net/netfilter/ipset/ip_set_core.c | 34 +- net/netfilter/ipset/ip_set_hash_gen.h | 633 +++++++++++++++++++++++---------- 3 files changed, 472 insertions(+), 206 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 908d38dbcb91..5448c8b443db 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -121,6 +121,7 @@ struct ip_set_ext { u32 timeout; u8 packets_op; u8 bytes_op; + bool target; }; struct ip_set; @@ -187,6 +188,14 @@ struct ip_set_type_variant { /* Return true if "b" set is the same as "a" * according to the create set parameters */ bool (*same_set)(const struct ip_set *a, const struct ip_set *b); + /* Region-locking is used */ + bool region_lock; +}; + +struct ip_set_region { + spinlock_t lock; /* Region lock */ + size_t ext_size; /* Size of the dynamic extensions */ + u32 elements; /* Number of elements vs timeout */ }; /* The core set type structure */ @@ -501,7 +510,7 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, } #define IP_SET_INIT_KEXT(skb, opt, set) \ - { .bytes = (skb)->len, .packets = 1, \ + { .bytes = (skb)->len, .packets = 1, .target = true,\ .timeout = ip_set_adt_opt_timeout(opt, set) } #define IP_SET_INIT_UEXT(set) \ diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 69c107f9ba8d..8dd17589217d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -723,6 +723,20 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index) return set; } +static inline void +ip_set_lock(struct ip_set *set) +{ + if (!set->variant->region_lock) + spin_lock_bh(&set->lock); +} + +static inline void +ip_set_unlock(struct ip_set *set) +{ + if (!set->variant->region_lock) + spin_unlock_bh(&set->lock); +} + int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) @@ -744,9 +758,9 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, if (ret == -EAGAIN) { /* Type requests element to be completed */ pr_debug("element must be completed, ADD is triggered\n"); - spin_lock_bh(&set->lock); + ip_set_lock(set); set->variant->kadt(set, skb, par, IPSET_ADD, opt); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); ret = 1; } else { /* --return-nomatch: invert matched element */ @@ -775,9 +789,9 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - spin_lock_bh(&set->lock); + ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); return ret; } @@ -797,9 +811,9 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - spin_lock_bh(&set->lock); + ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); return ret; } @@ -1264,9 +1278,9 @@ ip_set_flush_set(struct ip_set *set) { pr_debug("set: %s\n", set->name); - spin_lock_bh(&set->lock); + ip_set_lock(set); set->variant->flush(set); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); } static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb, @@ -1713,9 +1727,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { - spin_lock_bh(&set->lock); + ip_set_lock(set); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); retried = true; } while (ret == -EAGAIN && set->variant->resize && diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 7480ce55b5c8..71e93eac0831 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -7,13 +7,21 @@ #include #include #include +#include #include -#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) -#define ipset_dereference_protected(p, set) \ - __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock)) - -#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) +#define __ipset_dereference(p) \ + rcu_dereference_protected(p, 1) +#define ipset_dereference_nfnl(p) \ + rcu_dereference_protected(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) +#define ipset_dereference_set(p, set) \ + rcu_dereference_protected(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ + lockdep_is_held(&(set)->lock)) +#define ipset_dereference_bh_nfnl(p) \ + rcu_dereference_bh_check(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* Hashing which uses arrays to resolve clashing. The hash table is resized * (doubled) when searching becomes too long. @@ -72,11 +80,35 @@ struct hbucket { __aligned(__alignof__(u64)); }; +/* Region size for locking == 2^HTABLE_REGION_BITS */ +#define HTABLE_REGION_BITS 10 +#define ahash_numof_locks(htable_bits) \ + ((htable_bits) < HTABLE_REGION_BITS ? 1 \ + : jhash_size((htable_bits) - HTABLE_REGION_BITS)) +#define ahash_sizeof_regions(htable_bits) \ + (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) +#define ahash_region(n, htable_bits) \ + ((n) % ahash_numof_locks(htable_bits)) +#define ahash_bucket_start(h, htable_bits) \ + ((htable_bits) < HTABLE_REGION_BITS ? 0 \ + : (h) * jhash_size(HTABLE_REGION_BITS)) +#define ahash_bucket_end(h, htable_bits) \ + ((htable_bits) < HTABLE_REGION_BITS ? jhash_size(htable_bits) \ + : ((h) + 1) * jhash_size(HTABLE_REGION_BITS)) + +struct htable_gc { + struct delayed_work dwork; + struct ip_set *set; /* Set the gc belongs to */ + u32 region; /* Last gc run position */ +}; + /* The hash table: the table size stored here in order to make resizing easy */ struct htable { atomic_t ref; /* References for resizing */ - atomic_t uref; /* References for dumping */ + atomic_t uref; /* References for dumping and gc */ u8 htable_bits; /* size of hash table == 2^htable_bits */ + u32 maxelem; /* Maxelem per region */ + struct ip_set_region *hregion; /* Region locks and ext sizes */ struct hbucket __rcu *bucket[0]; /* hashtable buckets */ }; @@ -162,6 +194,10 @@ htable_bits(u32 hashsize) #define NLEN 0 #endif /* IP_SET_HASH_WITH_NETS */ +#define SET_ELEM_EXPIRED(set, d) \ + (SET_WITH_TIMEOUT(set) && \ + ip_set_timeout_expired(ext_timeout(d, set))) + #endif /* _IP_SET_HASH_GEN_H */ #ifndef MTYPE @@ -205,10 +241,12 @@ htable_bits(u32 hashsize) #undef mtype_test_cidrs #undef mtype_test #undef mtype_uref -#undef mtype_expire #undef mtype_resize +#undef mtype_ext_size +#undef mtype_resize_ad #undef mtype_head #undef mtype_list +#undef mtype_gc_do #undef mtype_gc #undef mtype_gc_init #undef mtype_variant @@ -247,10 +285,12 @@ htable_bits(u32 hashsize) #define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs) #define mtype_test IPSET_TOKEN(MTYPE, _test) #define mtype_uref IPSET_TOKEN(MTYPE, _uref) -#define mtype_expire IPSET_TOKEN(MTYPE, _expire) #define mtype_resize IPSET_TOKEN(MTYPE, _resize) +#define mtype_ext_size IPSET_TOKEN(MTYPE, _ext_size) +#define mtype_resize_ad IPSET_TOKEN(MTYPE, _resize_ad) #define mtype_head IPSET_TOKEN(MTYPE, _head) #define mtype_list IPSET_TOKEN(MTYPE, _list) +#define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) @@ -275,8 +315,7 @@ htable_bits(u32 hashsize) /* The generic hash structure */ struct htype { struct htable __rcu *table; /* the hash table */ - struct timer_list gc; /* garbage collection when timeout enabled */ - struct ip_set *set; /* attached to this ip_set */ + struct htable_gc gc; /* gc workqueue */ u32 maxelem; /* max elements in the hash */ u32 initval; /* random jhash init value */ #ifdef IP_SET_HASH_WITH_MARKMASK @@ -288,21 +327,33 @@ struct htype { #ifdef IP_SET_HASH_WITH_NETMASK u8 netmask; /* netmask value for subnets to store */ #endif + struct list_head ad; /* Resize add|del backlist */ struct mtype_elem next; /* temporary storage for uadd */ #ifdef IP_SET_HASH_WITH_NETS struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */ #endif }; +/* ADD|DEL entries saved during resize */ +struct mtype_resize_ad { + struct list_head list; + enum ipset_adt ad; /* ADD|DEL element */ + struct mtype_elem d; /* Element value */ + struct ip_set_ext ext; /* Extensions for ADD */ + struct ip_set_ext mext; /* Target extensions for ADD */ + u32 flags; /* Flags for ADD */ +}; + #ifdef IP_SET_HASH_WITH_NETS /* Network cidr size book keeping when the hash stores different * sized networks. cidr == real cidr + 1 to support /0. */ static void -mtype_add_cidr(struct htype *h, u8 cidr, u8 n) +mtype_add_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n) { int i, j; + spin_lock_bh(&set->lock); /* Add in increasing prefix order, so larger cidr first */ for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) { if (j != -1) { @@ -311,7 +362,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n) j = i; } else if (h->nets[i].cidr[n] == cidr) { h->nets[CIDR_POS(cidr)].nets[n]++; - return; + goto unlock; } } if (j != -1) { @@ -320,24 +371,29 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n) } h->nets[i].cidr[n] = cidr; h->nets[CIDR_POS(cidr)].nets[n] = 1; +unlock: + spin_unlock_bh(&set->lock); } static void -mtype_del_cidr(struct htype *h, u8 cidr, u8 n) +mtype_del_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n) { u8 i, j, net_end = NLEN - 1; + spin_lock_bh(&set->lock); for (i = 0; i < NLEN; i++) { if (h->nets[i].cidr[n] != cidr) continue; h->nets[CIDR_POS(cidr)].nets[n]--; if (h->nets[CIDR_POS(cidr)].nets[n] > 0) - return; + goto unlock; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; h->nets[j].cidr[n] = 0; - return; + goto unlock; } +unlock: + spin_unlock_bh(&set->lock); } #endif @@ -345,7 +401,7 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 n) static size_t mtype_ahash_memsize(const struct htype *h, const struct htable *t) { - return sizeof(*h) + sizeof(*t); + return sizeof(*h) + sizeof(*t) + ahash_sizeof_regions(t->htable_bits); } /* Get the ith element from the array block n */ @@ -369,24 +425,29 @@ mtype_flush(struct ip_set *set) struct htype *h = set->data; struct htable *t; struct hbucket *n; - u32 i; - - t = ipset_dereference_protected(h->table, set); - for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(t, i), 1); - if (!n) - continue; - if (set->extensions & IPSET_EXT_DESTROY) - mtype_ext_cleanup(set, n); - /* FIXME: use slab cache */ - rcu_assign_pointer(hbucket(t, i), NULL); - kfree_rcu(n, rcu); + u32 r, i; + + t = ipset_dereference_nfnl(h->table); + for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { + spin_lock_bh(&t->hregion[r].lock); + for (i = ahash_bucket_start(r, t->htable_bits); + i < ahash_bucket_end(r, t->htable_bits); i++) { + n = __ipset_dereference(hbucket(t, i)); + if (!n) + continue; + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + rcu_assign_pointer(hbucket(t, i), NULL); + kfree_rcu(n, rcu); + } + t->hregion[r].ext_size = 0; + t->hregion[r].elements = 0; + spin_unlock_bh(&t->hregion[r].lock); } #ifdef IP_SET_HASH_WITH_NETS memset(h->nets, 0, sizeof(h->nets)); #endif - set->elements = 0; - set->ext_size = 0; } /* Destroy the hashtable part of the set */ @@ -397,7 +458,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(t, i), 1); + n = __ipset_dereference(hbucket(t, i)); if (!n) continue; if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) @@ -406,6 +467,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) kfree(n); } + ip_set_free(t->hregion); ip_set_free(t); } @@ -414,28 +476,21 @@ static void mtype_destroy(struct ip_set *set) { struct htype *h = set->data; + struct list_head *l, *lt; if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&h->gc); + cancel_delayed_work_sync(&h->gc.dwork); - mtype_ahash_destroy(set, - __ipset_dereference_protected(h->table, 1), true); + mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); + list_for_each_safe(l, lt, &h->ad) { + list_del(l); + kfree(l); + } kfree(h); set->data = NULL; } -static void -mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) -{ - struct htype *h = set->data; - - timer_setup(&h->gc, gc, 0); - mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); - pr_debug("gc initialized, run in every %u\n", - IPSET_GC_PERIOD(set->timeout)); -} - static bool mtype_same_set(const struct ip_set *a, const struct ip_set *b) { @@ -454,11 +509,9 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b) a->extensions == b->extensions; } -/* Delete expired elements from the hashtable */ static void -mtype_expire(struct ip_set *set, struct htype *h) +mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r) { - struct htable *t; struct hbucket *n, *tmp; struct mtype_elem *data; u32 i, j, d; @@ -466,10 +519,12 @@ mtype_expire(struct ip_set *set, struct htype *h) #ifdef IP_SET_HASH_WITH_NETS u8 k; #endif + u8 htable_bits = t->htable_bits; - t = ipset_dereference_protected(h->table, set); - for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(t, i), 1); + spin_lock_bh(&t->hregion[r].lock); + for (i = ahash_bucket_start(r, htable_bits); + i < ahash_bucket_end(r, htable_bits); i++) { + n = __ipset_dereference(hbucket(t, i)); if (!n) continue; for (j = 0, d = 0; j < n->pos; j++) { @@ -485,58 +540,100 @@ mtype_expire(struct ip_set *set, struct htype *h) smp_mb__after_atomic(); #ifdef IP_SET_HASH_WITH_NETS for (k = 0; k < IPSET_NET_COUNT; k++) - mtype_del_cidr(h, + mtype_del_cidr(set, h, NCIDR_PUT(DCIDR_GET(data->cidr, k)), k); #endif + t->hregion[r].elements--; ip_set_ext_destroy(set, data); - set->elements--; d++; } if (d >= AHASH_INIT_SIZE) { if (d >= n->size) { + t->hregion[r].ext_size -= + ext_size(n->size, dsize); rcu_assign_pointer(hbucket(t, i), NULL); kfree_rcu(n, rcu); continue; } tmp = kzalloc(sizeof(*tmp) + - (n->size - AHASH_INIT_SIZE) * dsize, - GFP_ATOMIC); + (n->size - AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); if (!tmp) - /* Still try to delete expired elements */ + /* Still try to delete expired elements. */ continue; tmp->size = n->size - AHASH_INIT_SIZE; for (j = 0, d = 0; j < n->pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, dsize); - memcpy(tmp->value + d * dsize, data, dsize); + memcpy(tmp->value + d * dsize, + data, dsize); set_bit(d, tmp->used); d++; } tmp->pos = d; - set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize); + t->hregion[r].ext_size -= + ext_size(AHASH_INIT_SIZE, dsize); rcu_assign_pointer(hbucket(t, i), tmp); kfree_rcu(n, rcu); } } + spin_unlock_bh(&t->hregion[r].lock); } static void -mtype_gc(struct timer_list *t) +mtype_gc(struct work_struct *work) { - struct htype *h = from_timer(h, t, gc); - struct ip_set *set = h->set; + struct htable_gc *gc; + struct ip_set *set; + struct htype *h; + struct htable *t; + u32 r, numof_locks; + unsigned int next_run; + + gc = container_of(work, struct htable_gc, dwork.work); + set = gc->set; + h = set->data; - pr_debug("called\n"); spin_lock_bh(&set->lock); - mtype_expire(set, h); + t = ipset_dereference_set(h->table, set); + atomic_inc(&t->uref); + numof_locks = ahash_numof_locks(t->htable_bits); + r = gc->region++; + if (r >= numof_locks) { + r = gc->region = 0; + } + next_run = (IPSET_GC_PERIOD(set->timeout) * HZ) / numof_locks; + if (next_run < HZ/10) + next_run = HZ/10; spin_unlock_bh(&set->lock); - h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; - add_timer(&h->gc); + mtype_gc_do(set, h, t, r); + + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + pr_debug("Table destroy after resize by expire: %p\n", t); + mtype_ahash_destroy(set, t, false); + } + + queue_delayed_work(system_power_efficient_wq, &gc->dwork, next_run); + +} + +static void +mtype_gc_init(struct htable_gc *gc) +{ + INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc); + queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); } +static int +mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags); +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags); + /* Resize a hash: create a new hash table with doubling the hashsize * and inserting the elements to it. Repeat until we succeed or * fail due to memory pressures. @@ -547,7 +644,7 @@ mtype_resize(struct ip_set *set, bool retried) struct htype *h = set->data; struct htable *t, *orig; u8 htable_bits; - size_t extsize, dsize = set->dsize; + size_t dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; struct mtype_elem *tmp; @@ -555,7 +652,9 @@ mtype_resize(struct ip_set *set, bool retried) struct mtype_elem *data; struct mtype_elem *d; struct hbucket *n, *m; - u32 i, j, key; + struct list_head *l, *lt; + struct mtype_resize_ad *x; + u32 i, j, r, nr, key; int ret; #ifdef IP_SET_HASH_WITH_NETS @@ -563,10 +662,8 @@ mtype_resize(struct ip_set *set, bool retried) if (!tmp) return -ENOMEM; #endif - rcu_read_lock_bh(); - orig = rcu_dereference_bh_nfnl(h->table); + orig = ipset_dereference_bh_nfnl(h->table); htable_bits = orig->htable_bits; - rcu_read_unlock_bh(); retry: ret = 0; @@ -583,88 +680,124 @@ retry: ret = -ENOMEM; goto out; } + t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits)); + if (!t->hregion) { + kfree(t); + ret = -ENOMEM; + goto out; + } t->htable_bits = htable_bits; + t->maxelem = h->maxelem / ahash_numof_locks(htable_bits); + for (i = 0; i < ahash_numof_locks(htable_bits); i++) + spin_lock_init(&t->hregion[i].lock); - spin_lock_bh(&set->lock); - orig = __ipset_dereference_protected(h->table, 1); - /* There can't be another parallel resizing, but dumping is possible */ + /* There can't be another parallel resizing, + * but dumping, gc, kernel side add/del are possible + */ + orig = ipset_dereference_bh_nfnl(h->table); atomic_set(&orig->ref, 1); atomic_inc(&orig->uref); - extsize = 0; pr_debug("attempt to resize set %s from %u to %u, t %p\n", set->name, orig->htable_bits, htable_bits, orig); - for (i = 0; i < jhash_size(orig->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(orig, i), 1); - if (!n) - continue; - for (j = 0; j < n->pos; j++) { - if (!test_bit(j, n->used)) + for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) { + /* Expire may replace a hbucket with another one */ + rcu_read_lock_bh(); + for (i = ahash_bucket_start(r, orig->htable_bits); + i < ahash_bucket_end(r, orig->htable_bits); i++) { + n = __ipset_dereference(hbucket(orig, i)); + if (!n) continue; - data = ahash_data(n, j, dsize); + for (j = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); + if (SET_ELEM_EXPIRED(set, data)) + continue; #ifdef IP_SET_HASH_WITH_NETS - /* We have readers running parallel with us, - * so the live data cannot be modified. - */ - flags = 0; - memcpy(tmp, data, dsize); - data = tmp; - mtype_data_reset_flags(data, &flags); + /* We have readers running parallel with us, + * so the live data cannot be modified. + */ + flags = 0; + memcpy(tmp, data, dsize); + data = tmp; + mtype_data_reset_flags(data, &flags); #endif - key = HKEY(data, h->initval, htable_bits); - m = __ipset_dereference_protected(hbucket(t, key), 1); - if (!m) { - m = kzalloc(sizeof(*m) + + key = HKEY(data, h->initval, htable_bits); + m = __ipset_dereference(hbucket(t, key)); + nr = ahash_region(key, htable_bits); + if (!m) { + m = kzalloc(sizeof(*m) + AHASH_INIT_SIZE * dsize, GFP_ATOMIC); - if (!m) { - ret = -ENOMEM; - goto cleanup; - } - m->size = AHASH_INIT_SIZE; - extsize += ext_size(AHASH_INIT_SIZE, dsize); - RCU_INIT_POINTER(hbucket(t, key), m); - } else if (m->pos >= m->size) { - struct hbucket *ht; - - if (m->size >= AHASH_MAX(h)) { - ret = -EAGAIN; - } else { - ht = kzalloc(sizeof(*ht) + + if (!m) { + ret = -ENOMEM; + goto cleanup; + } + m->size = AHASH_INIT_SIZE; + t->hregion[nr].ext_size += + ext_size(AHASH_INIT_SIZE, + dsize); + RCU_INIT_POINTER(hbucket(t, key), m); + } else if (m->pos >= m->size) { + struct hbucket *ht; + + if (m->size >= AHASH_MAX(h)) { + ret = -EAGAIN; + } else { + ht = kzalloc(sizeof(*ht) + (m->size + AHASH_INIT_SIZE) * dsize, GFP_ATOMIC); - if (!ht) - ret = -ENOMEM; + if (!ht) + ret = -ENOMEM; + } + if (ret < 0) + goto cleanup; + memcpy(ht, m, sizeof(struct hbucket) + + m->size * dsize); + ht->size = m->size + AHASH_INIT_SIZE; + t->hregion[nr].ext_size += + ext_size(AHASH_INIT_SIZE, + dsize); + kfree(m); + m = ht; + RCU_INIT_POINTER(hbucket(t, key), ht); } - if (ret < 0) - goto cleanup; - memcpy(ht, m, sizeof(struct hbucket) + - m->size * dsize); - ht->size = m->size + AHASH_INIT_SIZE; - extsize += ext_size(AHASH_INIT_SIZE, dsize); - kfree(m); - m = ht; - RCU_INIT_POINTER(hbucket(t, key), ht); - } - d = ahash_data(m, m->pos, dsize); - memcpy(d, data, dsize); - set_bit(m->pos++, m->used); + d = ahash_data(m, m->pos, dsize); + memcpy(d, data, dsize); + set_bit(m->pos++, m->used); + t->hregion[nr].elements++; #ifdef IP_SET_HASH_WITH_NETS - mtype_data_reset_flags(d, &flags); + mtype_data_reset_flags(d, &flags); #endif + } } + rcu_read_unlock_bh(); } - rcu_assign_pointer(h->table, t); - set->ext_size = extsize; - spin_unlock_bh(&set->lock); + /* There can't be any other writer. */ + rcu_assign_pointer(h->table, t); /* Give time to other readers of the set */ synchronize_rcu(); pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, orig->htable_bits, orig, t->htable_bits, t); - /* If there's nobody else dumping the table, destroy it */ + /* Add/delete elements processed by the SET target during resize. + * Kernel-side add cannot trigger a resize and userspace actions + * are serialized by the mutex. + */ + list_for_each_safe(l, lt, &h->ad) { + x = list_entry(l, struct mtype_resize_ad, list); + if (x->ad == IPSET_ADD) { + mtype_add(set, &x->d, &x->ext, &x->mext, x->flags); + } else { + mtype_del(set, &x->d, NULL, NULL, 0); + } + list_del(l); + kfree(l); + } + /* If there's nobody else using the table, destroy it */ if (atomic_dec_and_test(&orig->uref)) { pr_debug("Table destroy by resize %p\n", orig); mtype_ahash_destroy(set, orig, false); @@ -677,15 +810,44 @@ out: return ret; cleanup: + rcu_read_unlock_bh(); atomic_set(&orig->ref, 0); atomic_dec(&orig->uref); - spin_unlock_bh(&set->lock); mtype_ahash_destroy(set, t, false); if (ret == -EAGAIN) goto retry; goto out; } +/* Get the current number of elements and ext_size in the set */ +static void +mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size) +{ + struct htype *h = set->data; + const struct htable *t; + u32 i, j, r; + struct hbucket *n; + struct mtype_elem *data; + + t = rcu_dereference_bh(h->table); + for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { + for (i = ahash_bucket_start(r, t->htable_bits); + i < ahash_bucket_end(r, t->htable_bits); i++) { + n = rcu_dereference_bh(hbucket(t, i)); + if (!n) + continue; + for (j = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, set->dsize); + if (!SET_ELEM_EXPIRED(set, data)) + (*elements)++; + } + } + *ext_size += t->hregion[r].ext_size; + } +} + /* Add an element to a hash and update the internal counters when succeeded, * otherwise report the proper error code. */ @@ -698,32 +860,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n, *old = ERR_PTR(-ENOENT); - int i, j = -1; + int i, j = -1, ret; bool flag_exist = flags & IPSET_FLAG_EXIST; bool deleted = false, forceadd = false, reuse = false; - u32 key, multi = 0; + u32 r, key, multi = 0, elements, maxelem; - if (set->elements >= h->maxelem) { - if (SET_WITH_TIMEOUT(set)) - /* FIXME: when set is full, we slow down here */ - mtype_expire(set, h); - if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set)) + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + r = ahash_region(key, t->htable_bits); + atomic_inc(&t->uref); + elements = t->hregion[r].elements; + maxelem = t->maxelem; + if (elements >= maxelem) { + u32 e; + if (SET_WITH_TIMEOUT(set)) { + rcu_read_unlock_bh(); + mtype_gc_do(set, h, t, r); + rcu_read_lock_bh(); + } + maxelem = h->maxelem; + elements = 0; + for (e = 0; e < ahash_numof_locks(t->htable_bits); e++) + elements += t->hregion[e].elements; + if (elements >= maxelem && SET_WITH_FORCEADD(set)) forceadd = true; } + rcu_read_unlock_bh(); - t = ipset_dereference_protected(h->table, set); - key = HKEY(value, h->initval, t->htable_bits); - n = __ipset_dereference_protected(hbucket(t, key), 1); + spin_lock_bh(&t->hregion[r].lock); + n = rcu_dereference_bh(hbucket(t, key)); if (!n) { - if (forceadd || set->elements >= h->maxelem) + if (forceadd || elements >= maxelem) goto set_full; old = NULL; n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize, GFP_ATOMIC); - if (!n) - return -ENOMEM; + if (!n) { + ret = -ENOMEM; + goto unlock; + } n->size = AHASH_INIT_SIZE; - set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize); + t->hregion[r].ext_size += + ext_size(AHASH_INIT_SIZE, set->dsize); goto copy_elem; } for (i = 0; i < n->pos; i++) { @@ -737,19 +916,16 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, } data = ahash_data(n, i, set->dsize); if (mtype_data_equal(data, d, &multi)) { - if (flag_exist || - (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(data, set)))) { + if (flag_exist || SET_ELEM_EXPIRED(set, data)) { /* Just the extensions could be overwritten */ j = i; goto overwrite_extensions; } - return -IPSET_ERR_EXIST; + ret = -IPSET_ERR_EXIST; + goto unlock; } /* Reuse first timed out entry */ - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(data, set)) && - j == -1) { + if (SET_ELEM_EXPIRED(set, data) && j == -1) { j = i; reuse = true; } @@ -759,16 +935,16 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (!deleted) { #ifdef IP_SET_HASH_WITH_NETS for (i = 0; i < IPSET_NET_COUNT; i++) - mtype_del_cidr(h, + mtype_del_cidr(set, h, NCIDR_PUT(DCIDR_GET(data->cidr, i)), i); #endif ip_set_ext_destroy(set, data); - set->elements--; + t->hregion[r].elements--; } goto copy_data; } - if (set->elements >= h->maxelem) + if (elements >= maxelem) goto set_full; /* Create a new slot */ if (n->pos >= n->size) { @@ -776,28 +952,32 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (n->size >= AHASH_MAX(h)) { /* Trigger rehashing */ mtype_data_next(&h->next, d); - return -EAGAIN; + ret = -EAGAIN; + goto resize; } old = n; n = kzalloc(sizeof(*n) + (old->size + AHASH_INIT_SIZE) * set->dsize, GFP_ATOMIC); - if (!n) - return -ENOMEM; + if (!n) { + ret = -ENOMEM; + goto unlock; + } memcpy(n, old, sizeof(struct hbucket) + old->size * set->dsize); n->size = old->size + AHASH_INIT_SIZE; - set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize); + t->hregion[r].ext_size += + ext_size(AHASH_INIT_SIZE, set->dsize); } copy_elem: j = n->pos++; data = ahash_data(n, j, set->dsize); copy_data: - set->elements++; + t->hregion[r].elements++; #ifdef IP_SET_HASH_WITH_NETS for (i = 0; i < IPSET_NET_COUNT; i++) - mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i); + mtype_add_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i); #endif memcpy(data, d, sizeof(struct mtype_elem)); overwrite_extensions: @@ -820,13 +1000,41 @@ overwrite_extensions: if (old) kfree_rcu(old, rcu); } + ret = 0; +resize: + spin_unlock_bh(&t->hregion[r].lock); + if (atomic_read(&t->ref) && ext->target) { + /* Resize is in process and kernel side add, save values */ + struct mtype_resize_ad *x; + + x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC); + if (!x) + /* Don't bother */ + goto out; + x->ad = IPSET_ADD; + memcpy(&x->d, value, sizeof(struct mtype_elem)); + memcpy(&x->ext, ext, sizeof(struct ip_set_ext)); + memcpy(&x->mext, mext, sizeof(struct ip_set_ext)); + x->flags = flags; + spin_lock_bh(&set->lock); + list_add_tail(&x->list, &h->ad); + spin_unlock_bh(&set->lock); + } + goto out; - return 0; set_full: if (net_ratelimit()) pr_warn("Set %s is full, maxelem %u reached\n", - set->name, h->maxelem); - return -IPSET_ERR_HASH_FULL; + set->name, maxelem); + ret = -IPSET_ERR_HASH_FULL; +unlock: + spin_unlock_bh(&t->hregion[r].lock); +out: + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + pr_debug("Table destroy after resize by add: %p\n", t); + mtype_ahash_destroy(set, t, false); + } + return ret; } /* Delete an element from the hash and free up space if possible. @@ -840,13 +1048,23 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n; - int i, j, k, ret = -IPSET_ERR_EXIST; + struct mtype_resize_ad *x = NULL; + int i, j, k, r, ret = -IPSET_ERR_EXIST; u32 key, multi = 0; size_t dsize = set->dsize; - t = ipset_dereference_protected(h->table, set); + /* Userspace add and resize is excluded by the mutex. + * Kernespace add does not trigger resize. + */ + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - n = __ipset_dereference_protected(hbucket(t, key), 1); + r = ahash_region(key, t->htable_bits); + atomic_inc(&t->uref); + rcu_read_unlock_bh(); + + spin_lock_bh(&t->hregion[r].lock); + n = rcu_dereference_bh(hbucket(t, key)); if (!n) goto out; for (i = 0, k = 0; i < n->pos; i++) { @@ -857,8 +1075,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, data = ahash_data(n, i, dsize); if (!mtype_data_equal(data, d, &multi)) continue; - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(data, set))) + if (SET_ELEM_EXPIRED(set, data)) goto out; ret = 0; @@ -866,20 +1083,33 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, smp_mb__after_atomic(); if (i + 1 == n->pos) n->pos--; - set->elements--; + t->hregion[r].elements--; #ifdef IP_SET_HASH_WITH_NETS for (j = 0; j < IPSET_NET_COUNT; j++) - mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)), - j); + mtype_del_cidr(set, h, + NCIDR_PUT(DCIDR_GET(d->cidr, j)), j); #endif ip_set_ext_destroy(set, data); + if (atomic_read(&t->ref) && ext->target) { + /* Resize is in process and kernel side del, + * save values + */ + x = kzalloc(sizeof(struct mtype_resize_ad), + GFP_ATOMIC); + if (x) { + x->ad = IPSET_DEL; + memcpy(&x->d, value, + sizeof(struct mtype_elem)); + x->flags = flags; + } + } for (; i < n->pos; i++) { if (!test_bit(i, n->used)) k++; } if (n->pos == 0 && k == 0) { - set->ext_size -= ext_size(n->size, dsize); + t->hregion[r].ext_size -= ext_size(n->size, dsize); rcu_assign_pointer(hbucket(t, key), NULL); kfree_rcu(n, rcu); } else if (k >= AHASH_INIT_SIZE) { @@ -898,7 +1128,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, k++; } tmp->pos = k; - set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize); + t->hregion[r].ext_size -= + ext_size(AHASH_INIT_SIZE, dsize); rcu_assign_pointer(hbucket(t, key), tmp); kfree_rcu(n, rcu); } @@ -906,6 +1137,16 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, } out: + spin_unlock_bh(&t->hregion[r].lock); + if (x) { + spin_lock_bh(&set->lock); + list_add(&x->list, &h->ad); + spin_unlock_bh(&set->lock); + } + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + pr_debug("Table destroy after resize by del: %p\n", t); + mtype_ahash_destroy(set, t, false); + } return ret; } @@ -991,6 +1232,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, int i, ret = 0; u32 key, multi = 0; + rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); #ifdef IP_SET_HASH_WITH_NETS /* If we test an IP address and not a network address, @@ -1022,6 +1264,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, goto out; } out: + rcu_read_unlock_bh(); return ret; } @@ -1033,23 +1276,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) const struct htable *t; struct nlattr *nested; size_t memsize; + u32 elements = 0; + size_t ext_size = 0; u8 htable_bits; - /* If any members have expired, set->elements will be wrong - * mytype_expire function will update it with the right count. - * we do not hold set->lock here, so grab it first. - * set->elements can still be incorrect in the case of a huge set, - * because elements might time out during the listing. - */ - if (SET_WITH_TIMEOUT(set)) { - spin_lock_bh(&set->lock); - mtype_expire(set, h); - spin_unlock_bh(&set->lock); - } - rcu_read_lock_bh(); - t = rcu_dereference_bh_nfnl(h->table); - memsize = mtype_ahash_memsize(h, t) + set->ext_size; + t = rcu_dereference_bh(h->table); + mtype_ext_size(set, &elements, &ext_size); + memsize = mtype_ahash_memsize(h, t) + ext_size + set->ext_size; htable_bits = t->htable_bits; rcu_read_unlock_bh(); @@ -1071,7 +1305,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) #endif if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || - nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements))) + nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; @@ -1091,15 +1325,15 @@ mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start) if (start) { rcu_read_lock_bh(); - t = rcu_dereference_bh_nfnl(h->table); + t = ipset_dereference_bh_nfnl(h->table); atomic_inc(&t->uref); cb->args[IPSET_CB_PRIVATE] = (unsigned long)t; rcu_read_unlock_bh(); } else if (cb->args[IPSET_CB_PRIVATE]) { t = (struct htable *)cb->args[IPSET_CB_PRIVATE]; if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { - /* Resizing didn't destroy the hash table */ - pr_debug("Table destroy by dump: %p\n", t); + pr_debug("Table destroy after resize " + " by dump: %p\n", t); mtype_ahash_destroy(set, t, false); } cb->args[IPSET_CB_PRIVATE] = 0; @@ -1141,8 +1375,7 @@ mtype_list(const struct ip_set *set, if (!test_bit(i, n->used)) continue; e = ahash_data(n, i, set->dsize); - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + if (SET_ELEM_EXPIRED(set, e)) continue; pr_debug("list hash %lu hbucket %p i %u, data %p\n", cb->args[IPSET_CB_ARG0], n, i, e); @@ -1208,6 +1441,7 @@ static const struct ip_set_type_variant mtype_variant = { .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, + .region_lock = true, }; #ifdef IP_SET_EMIT_CREATE @@ -1226,6 +1460,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, size_t hsize; struct htype *h; struct htable *t; + u32 i; pr_debug("Create set %s with family %s\n", set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); @@ -1294,6 +1529,15 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, kfree(h); return -ENOMEM; } + t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits)); + if (!t->hregion) { + kfree(t); + kfree(h); + return -ENOMEM; + } + h->gc.set = set; + for (i = 0; i < ahash_numof_locks(hbits); i++) + spin_lock_init(&t->hregion[i].lock); h->maxelem = maxelem; #ifdef IP_SET_HASH_WITH_NETMASK h->netmask = netmask; @@ -1304,9 +1548,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, get_random_bytes(&h->initval, sizeof(h->initval)); t->htable_bits = hbits; + t->maxelem = h->maxelem / ahash_numof_locks(hbits); RCU_INIT_POINTER(h->table, t); - h->set = set; + INIT_LIST_HEAD(&h->ad); set->data = h; #ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) { @@ -1329,12 +1574,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) #endif - IPSET_TOKEN(HTYPE, 4_gc_init)(set, - IPSET_TOKEN(HTYPE, 4_gc)); + IPSET_TOKEN(HTYPE, 4_gc_init)(&h->gc); #ifndef IP_SET_PROTO_UNDEF else - IPSET_TOKEN(HTYPE, 6_gc_init)(set, - IPSET_TOKEN(HTYPE, 6_gc)); + IPSET_TOKEN(HTYPE, 6_gc_init)(&h->gc); #endif } pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", -- cgit v1.2.3 From 8af1c6fbd9239877998c7f5a591cb2c88d41fb66 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 22 Feb 2020 12:01:43 +0100 Subject: netfilter: ipset: Fix forceadd evaluation path When the forceadd option is enabled, the hash:* types should find and replace the first entry in the bucket with the new one if there are no reuseable (deleted or timed out) entries. However, the position index was just not set to zero and remained the invalid -1 if there were no reuseable entries. Reported-by: syzbot+6a86565c74ebe30aea18@syzkaller.appspotmail.com Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7") Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_gen.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 71e93eac0831..e52d7b7597a0 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -931,6 +931,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, } } if (reuse || forceadd) { + if (j == -1) + j = 0; data = ahash_data(n, j, set->dsize); if (!deleted) { #ifdef IP_SET_HASH_WITH_NETS -- cgit v1.2.3 From 3e72dfdf8227b052393f71d820ec7599909dddc2 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 21 Feb 2020 12:28:38 +0100 Subject: ipv4: ensure rcu_read_lock() in cipso_v4_error() Similarly to commit c543cb4a5f07 ("ipv4: ensure rcu_read_lock() in ipv4_link_failure()"), __ip_options_compile() must be called under rcu protection. Fixes: 3da1ed7ac398 ("net: avoid use IPCB in cipso_v4_error") Suggested-by: Guillaume Nault Signed-off-by: Matteo Croce Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 376882215919..0bd10a1f477f 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1724,6 +1724,7 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; + int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) return; @@ -1735,7 +1736,11 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) memset(opt, 0, sizeof(struct ip_options)); opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); - if (__ip_options_compile(dev_net(skb->dev), opt, skb, NULL)) + rcu_read_lock(); + res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL); + rcu_read_unlock(); + + if (res) return; if (gateway) -- cgit v1.2.3 From 39f3b41aa7cae917f928ef9f31d09da28188e5ed Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 21 Feb 2020 19:42:13 +0100 Subject: net: genetlink: return the error code when attribute parsing fails. Currently if attribute parsing fails and the genl family does not support parallel operation, the error code returned by __nlmsg_parse() is discarded by genl_family_rcv_msg_attrs_parse(). Be sure to report the error for all genl families. Fixes: c10e6cf85e7d ("net: genetlink: push attrbuf allocation and parsing to a separate function") Fixes: ab5b526da048 ("net: genetlink: always allocate separate attrs for dumpit ops") Signed-off-by: Paolo Abeni Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 0522b2b1fd95..9f357aa22b94 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -497,8 +497,9 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family, err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, family->policy, validate, extack); - if (err && parallel) { - kfree(attrbuf); + if (err) { + if (parallel) + kfree(attrbuf); return ERR_PTR(err); } return attrbuf; -- cgit v1.2.3 From dad8cea7add96a353fa1898b5ccefbb72da66f29 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 22 Feb 2020 11:21:15 -0500 Subject: tcp: fix TFO SYNACK undo to avoid double-timestamp-undo In a rare corner case the new logic for undo of SYNACK RTO could result in triggering the warning in tcp_fastretrans_alert() that says: WARN_ON(tp->retrans_out != 0); The warning looked like: WARNING: CPU: 1 PID: 1 at net/ipv4/tcp_input.c:2818 tcp_ack+0x13e0/0x3270 The sequence that tickles this bug is: - Fast Open server receives TFO SYN with data, sends SYNACK - (client receives SYNACK and sends ACK, but ACK is lost) - server app sends some data packets - (N of the first data packets are lost) - server receives client ACK that has a TS ECR matching first SYNACK, and also SACKs suggesting the first N data packets were lost - server performs TS undo of SYNACK RTO, then immediately enters recovery - buggy behavior then performed a *second* undo that caused the connection to be in CA_Open with retrans_out != 0 Basically, the incoming ACK packet with SACK blocks causes us to first undo the cwnd reduction from the SYNACK RTO, but then immediately enters fast recovery, which then makes us eligible for undo again. And then tcp_rcv_synrecv_state_fastopen() accidentally performs an undo using a "mash-up" of state from two different loss recovery phases: it uses the timestamp info from the ACK of the original SYNACK, and the undo_marker from the fast recovery. This fix refines the logic to only invoke the tcp_try_undo_loss() inside tcp_rcv_synrecv_state_fastopen() if the connection is still in CA_Loss. If peer SACKs triggered fast recovery, then tcp_rcv_synrecv_state_fastopen() can't safely undo. Fixes: 794200d66273 ("tcp: undo cwnd on Fast Open spurious SYNACK retransmit") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 316ebdf8151d..6b6b57000dad 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6124,7 +6124,11 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) { struct request_sock *req; - tcp_try_undo_loss(sk, false); + /* If we are still handling the SYNACK RTO, see if timestamp ECR allows + * undo. If peer SACKs triggered fast recovery, we can't undo here. + */ + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) + tcp_try_undo_loss(sk, false); /* Reset rtx states to prevent spurious retransmits_timed_out() */ tcp_sk(sk)->retrans_stamp = 0; -- cgit v1.2.3 From 6132c1d9033d158bd0464e90bc46544fbe0bd6bc Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sun, 23 Feb 2020 16:52:33 +0530 Subject: net: core: devlink.c: Hold devlink->lock from the beginning of devlink_dpipe_table_register() devlink_dpipe_table_find() should be called under either rcu_read_lock() or devlink->lock. devlink_dpipe_table_register() calls devlink_dpipe_table_find() without holding the lock and acquires it later. Therefore hold the devlink->lock from the beginning of devlink_dpipe_table_register(). Suggested-by: Jiri Pirko Signed-off-by: Madhuparna Bhowmik Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 549ee56b7a21..8d0b558be942 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6878,26 +6878,33 @@ int devlink_dpipe_table_register(struct devlink *devlink, void *priv, bool counter_control_extern) { struct devlink_dpipe_table *table; - - if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name)) - return -EEXIST; + int err = 0; if (WARN_ON(!table_ops->size_get)) return -EINVAL; + mutex_lock(&devlink->lock); + + if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name)) { + err = -EEXIST; + goto unlock; + } + table = kzalloc(sizeof(*table), GFP_KERNEL); - if (!table) - return -ENOMEM; + if (!table) { + err = -ENOMEM; + goto unlock; + } table->name = table_name; table->table_ops = table_ops; table->priv = priv; table->counter_control_extern = counter_control_extern; - mutex_lock(&devlink->lock); list_add_tail_rcu(&table->list, &devlink->dpipe_table_list); +unlock: mutex_unlock(&devlink->lock); - return 0; + return err; } EXPORT_SYMBOL_GPL(devlink_dpipe_table_register); -- cgit v1.2.3 From e3ae39edbce6dc933fb1393490d1b5d76d3edb90 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Feb 2020 09:38:15 +0100 Subject: nl80211: explicitly include if_vlan.h We use that here, and do seem to get it through some recursive include, but better include it explicitly. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200224093814.1b9c258fec67.I45ac150d4e11c72eb263abec9f1f0c7add9bef2b@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 46be40e19e7f..5b19e9fac4aa 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 253216ffb2a002a682c6f68bd3adff5b98b71de8 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sun, 23 Feb 2020 20:03:02 +0530 Subject: mac80211: rx: avoid RCU list traversal under mutex local->sta_mtx is held in __ieee80211_check_fast_rx_iface(). No need to use list_for_each_entry_rcu() as it also requires a cond argument to avoid false lockdep warnings when not used in RCU read-side section (with CONFIG_PROVE_RCU_LIST). Therefore use list_for_each_entry(); Signed-off-by: Madhuparna Bhowmik Link: https://lore.kernel.org/r/20200223143302.15390-1-madhuparnabhowmik10@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 0e05ff037672..0ba98ad9bc85 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4114,7 +4114,7 @@ void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata) lockdep_assert_held(&local->sta_mtx); - list_for_each_entry_rcu(sta, &local->sta_list, list) { + list_for_each_entry(sta, &local->sta_list, list) { if (sdata != sta->sdata && (!sta->sdata->bss || sta->sdata->bss != sdata->bss)) continue; -- cgit v1.2.3 From 823d81b0fa2cd83a640734e74caee338b5d3c093 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 24 Feb 2020 18:46:22 +0200 Subject: net: bridge: fix stale eth hdr pointer in br_dev_xmit In br_dev_xmit() we perform vlan filtering in br_allowed_ingress() but if the packet has the vlan header inside (e.g. bridge with disabled tx-vlan-offload) then the vlan filtering code will use skb_vlan_untag() to extract the vid before filtering which in turn calls pskb_may_pull() and we may end up with a stale eth pointer. Moreover the cached eth header pointer will generally be wrong after that operation. Remove the eth header caching and just use eth_hdr() directly, the compiler does the right thing and calculates it only once so we don't lose anything. Fixes: 057658cb33fb ("bridge: suppress arp pkts on BR_NEIGH_SUPPRESS ports") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_device.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index dc3d2c1dd9d5..0e3dbc5f3c34 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -34,7 +34,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) const struct nf_br_ops *nf_ops; u8 state = BR_STATE_FORWARDING; const unsigned char *dest; - struct ethhdr *eth; u16 vid = 0; rcu_read_lock(); @@ -54,15 +53,14 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) BR_INPUT_SKB_CB(skb)->frag_max_size = 0; skb_reset_mac_header(skb); - eth = eth_hdr(skb); skb_pull(skb, ETH_HLEN); if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state)) goto out; if (IS_ENABLED(CONFIG_INET) && - (eth->h_proto == htons(ETH_P_ARP) || - eth->h_proto == htons(ETH_P_RARP)) && + (eth_hdr(skb)->h_proto == htons(ETH_P_ARP) || + eth_hdr(skb)->h_proto == htons(ETH_P_RARP)) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { br_do_proxy_suppress_arp(skb, br, vid, NULL); } else if (IS_ENABLED(CONFIG_IPV6) && -- cgit v1.2.3 From 212d58c106fd0f2704664be2bb173e14cb4e86d3 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 21 Feb 2020 03:04:21 +0100 Subject: nft_set_pipapo: Actually fetch key data in nft_pipapo_remove() Phil reports that adding elements, flushing and re-adding them right away: nft add table t '{ set s { type ipv4_addr . inet_service; flags interval; }; }' nft add element t s '{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }' nft flush set t s nft add element t s '{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }' triggers, almost reliably, a crash like this one: [ 71.319848] general protection fault, probably for non-canonical address 0x6f6b6e696c2e756e: 0000 [#1] PREEMPT SMP PTI [ 71.321540] CPU: 3 PID: 1201 Comm: kworker/3:2 Not tainted 5.6.0-rc1-00377-g2bb07f4e1d861 #192 [ 71.322746] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190711_202441-buildvm-armv7-10.arm.fedoraproject.org-2.fc31 04/01/2014 [ 71.324430] Workqueue: events nf_tables_trans_destroy_work [nf_tables] [ 71.325387] RIP: 0010:nft_set_elem_destroy+0xa5/0x110 [nf_tables] [ 71.326164] Code: 89 d4 84 c0 74 0e 8b 77 44 0f b6 f8 48 01 df e8 41 ff ff ff 45 84 e4 74 36 44 0f b6 63 08 45 84 e4 74 2c 49 01 dc 49 8b 04 24 <48> 8b 40 38 48 85 c0 74 4f 48 89 e7 4c 8b [ 71.328423] RSP: 0018:ffffc9000226fd90 EFLAGS: 00010282 [ 71.329225] RAX: 6f6b6e696c2e756e RBX: ffff88813ab79f60 RCX: ffff88813931b5a0 [ 71.330365] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff88813ab79f9a [ 71.331473] RBP: ffff88813ab79f60 R08: 0000000000000008 R09: 0000000000000000 [ 71.332627] R10: 000000000000021c R11: 0000000000000000 R12: ffff88813ab79fc2 [ 71.333615] R13: ffff88813b3adf50 R14: dead000000000100 R15: ffff88813931b8a0 [ 71.334596] FS: 0000000000000000(0000) GS:ffff88813bd80000(0000) knlGS:0000000000000000 [ 71.335780] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 71.336577] CR2: 000055ac683710f0 CR3: 000000013a222003 CR4: 0000000000360ee0 [ 71.337533] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 71.338557] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 71.339718] Call Trace: [ 71.340093] nft_pipapo_destroy+0x7a/0x170 [nf_tables_set] [ 71.340973] nft_set_destroy+0x20/0x50 [nf_tables] [ 71.341879] nf_tables_trans_destroy_work+0x246/0x260 [nf_tables] [ 71.342916] process_one_work+0x1d5/0x3c0 [ 71.343601] worker_thread+0x4a/0x3c0 [ 71.344229] kthread+0xfb/0x130 [ 71.344780] ? process_one_work+0x3c0/0x3c0 [ 71.345477] ? kthread_park+0x90/0x90 [ 71.346129] ret_from_fork+0x35/0x40 [ 71.346748] Modules linked in: nf_tables_set nf_tables nfnetlink 8021q [last unloaded: nfnetlink] [ 71.348153] ---[ end trace 2eaa8149ca759bcc ]--- [ 71.349066] RIP: 0010:nft_set_elem_destroy+0xa5/0x110 [nf_tables] [ 71.350016] Code: 89 d4 84 c0 74 0e 8b 77 44 0f b6 f8 48 01 df e8 41 ff ff ff 45 84 e4 74 36 44 0f b6 63 08 45 84 e4 74 2c 49 01 dc 49 8b 04 24 <48> 8b 40 38 48 85 c0 74 4f 48 89 e7 4c 8b [ 71.350017] RSP: 0018:ffffc9000226fd90 EFLAGS: 00010282 [ 71.350019] RAX: 6f6b6e696c2e756e RBX: ffff88813ab79f60 RCX: ffff88813931b5a0 [ 71.350019] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff88813ab79f9a [ 71.350020] RBP: ffff88813ab79f60 R08: 0000000000000008 R09: 0000000000000000 [ 71.350021] R10: 000000000000021c R11: 0000000000000000 R12: ffff88813ab79fc2 [ 71.350022] R13: ffff88813b3adf50 R14: dead000000000100 R15: ffff88813931b8a0 [ 71.350025] FS: 0000000000000000(0000) GS:ffff88813bd80000(0000) knlGS:0000000000000000 [ 71.350026] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 71.350027] CR2: 000055ac683710f0 CR3: 000000013a222003 CR4: 0000000000360ee0 [ 71.350028] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 71.350028] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 71.350030] Kernel panic - not syncing: Fatal exception [ 71.350412] Kernel Offset: disabled [ 71.365922] ---[ end Kernel panic - not syncing: Fatal exception ]--- which is caused by dangling elements that have been deactivated, but never removed. On a flush operation, nft_pipapo_walk() walks through all the elements in the mapping table, which are then deactivated by nft_flush_set(), one by one, and added to the commit list for removal. Element data is then freed. On transaction commit, nft_pipapo_remove() is called, and failed to remove these elements, leading to the stale references in the mapping. The first symptom of this, revealed by KASan, is a one-byte use-after-free in subsequent calls to nft_pipapo_walk(), which is usually not enough to trigger a panic. When stale elements are used more heavily, though, such as double-free via nft_pipapo_destroy() as in Phil's case, the problem becomes more noticeable. The issue comes from that fact that, on a flush operation, nft_pipapo_remove() won't get the actual key data via elem->key, elements to be deleted upon commit won't be found by the lookup via pipapo_get(), and removal will be skipped. Key data should be fetched via nft_set_ext_key(), instead. Reported-by: Phil Sutter Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index feac8553f6d9..4fc0c924ed5d 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1766,11 +1766,13 @@ static bool pipapo_match_field(struct nft_pipapo_field *f, static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { - const u8 *data = (const u8 *)elem->key.val.data; struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m = priv->clone; + struct nft_pipapo_elem *e = elem->priv; int rules_f0, first_rule = 0; - struct nft_pipapo_elem *e; + const u8 *data; + + data = (const u8 *)nft_set_ext_key(&e->ext); e = pipapo_get(net, set, data, 0); if (IS_ERR(e)) -- cgit v1.2.3 From 6e11d1578fba8d09d03a286740ffcf336d53928c Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Mon, 24 Feb 2020 10:56:00 -0800 Subject: net: Fix Tx hash bound checking Fixes the lower and upper bounds when there are multiple TCs and traffic is on the the same TC on the same device. The lower bound is represented by 'qoffset' and the upper limit for hash value is 'qcount + qoffset'. This gives a clean Rx to Tx queue mapping when there are multiple TCs, as the queue indices for upper TCs will be offset by 'qoffset'. v2: Fixed commit description based on comments. Fixes: 1b837d489e06 ("net: Revoke export for __skb_tx_hash, update it to just be static skb_tx_hash") Fixes: eadec877ce9c ("net: Add support for subordinate traffic classes to netdev_pick_tx") Signed-off-by: Amritha Nambiar Reviewed-by: Alexander Duyck Reviewed-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index e10bd680dc03..c6c985fe7b1b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3076,6 +3076,8 @@ static u16 skb_tx_hash(const struct net_device *dev, if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); + if (hash >= qoffset) + hash -= qoffset; while (unlikely(hash >= qcount)) hash -= qcount; return hash + qoffset; -- cgit v1.2.3 From e34f1753eebc428c312527662eb1b529cf260240 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Mon, 24 Feb 2020 20:42:12 +0100 Subject: ethtool: limit bitset size Syzbot reported that ethnl_compact_sanity_checks() can be tricked into reading past the end of ETHTOOL_A_BITSET_VALUE and ETHTOOL_A_BITSET_MASK attributes and even the message by passing a value between (u32)(-31) and (u32)(-1) as ETHTOOL_A_BITSET_SIZE. The problem is that DIV_ROUND_UP(attr_nbits, 32) is 0 for such values so that zero length ETHTOOL_A_BITSET_VALUE will pass the length check but ethnl_bitmap32_not_zero() check would try to access up to 512 MB of attribute "payload". Prevent this overflow byt limiting the bitset size. Technically, compact bitset format would allow bitset sizes up to almost 2^18 (so that the nest size does not exceed U16_MAX) but bitsets used by ethtool are much shorter. S16_MAX, the largest value which can be directly used as an upper limit in policy, should be a reasonable compromise. Fixes: 10b518d4e6dd ("ethtool: netlink bitset handling") Reported-by: syzbot+7fd4ed5b4234ab1fdccd@syzkaller.appspotmail.com Reported-by: syzbot+709b7a64d57978247e44@syzkaller.appspotmail.com Reported-by: syzbot+983cb8fb2d17a7af549d@syzkaller.appspotmail.com Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/bitset.c | 3 ++- net/ethtool/bitset.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c index 8977fe1f3946..ef9197541cb3 100644 --- a/net/ethtool/bitset.c +++ b/net/ethtool/bitset.c @@ -305,7 +305,8 @@ nla_put_failure: static const struct nla_policy bitset_policy[ETHTOOL_A_BITSET_MAX + 1] = { [ETHTOOL_A_BITSET_UNSPEC] = { .type = NLA_REJECT }, [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG }, - [ETHTOOL_A_BITSET_SIZE] = { .type = NLA_U32 }, + [ETHTOOL_A_BITSET_SIZE] = NLA_POLICY_MAX(NLA_U32, + ETHNL_MAX_BITSET_SIZE), [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED }, [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY }, [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY }, diff --git a/net/ethtool/bitset.h b/net/ethtool/bitset.h index b8247e34109d..b849f9d19676 100644 --- a/net/ethtool/bitset.h +++ b/net/ethtool/bitset.h @@ -3,6 +3,8 @@ #ifndef _NET_ETHTOOL_BITSET_H #define _NET_ETHTOOL_BITSET_H +#define ETHNL_MAX_BITSET_SIZE S16_MAX + typedef const char (*const ethnl_string_array_t)[ETH_GSTRING_LEN]; int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact); -- cgit v1.2.3 From 99b79c3900d4627672c85d9f344b5b0f06bc2a4d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 12 Feb 2020 22:53:52 -0800 Subject: netfilter: xt_hashlimit: unregister proc file before releasing mutex Before releasing the global mutex, we only unlink the hashtable from the hash list, its proc file is still not unregistered at this point. So syzbot could trigger a race condition where a parallel htable_create() could register the same file immediately after the mutex is released. Move htable_remove_proc_entry() back to mutex protection to fix this. And, fold htable_destroy() into htable_put() to make the code slightly easier to understand. Reported-and-tested-by: syzbot+d195fd3b9a364ddd6731@syzkaller.appspotmail.com Fixes: c4a3922d2d20 ("netfilter: xt_hashlimit: reduce hashlimit_mutex scope for htable_put()") Signed-off-by: Cong Wang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 7a2c4b8408c4..8c835ad63729 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -402,15 +402,6 @@ static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo) remove_proc_entry(hinfo->name, parent); } -static void htable_destroy(struct xt_hashlimit_htable *hinfo) -{ - cancel_delayed_work_sync(&hinfo->gc_work); - htable_remove_proc_entry(hinfo); - htable_selective_cleanup(hinfo, true); - kfree(hinfo->name); - vfree(hinfo); -} - static struct xt_hashlimit_htable *htable_find_get(struct net *net, const char *name, u_int8_t family) @@ -432,8 +423,13 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) { if (refcount_dec_and_mutex_lock(&hinfo->use, &hashlimit_mutex)) { hlist_del(&hinfo->node); + htable_remove_proc_entry(hinfo); mutex_unlock(&hashlimit_mutex); - htable_destroy(hinfo); + + cancel_delayed_work_sync(&hinfo->gc_work); + htable_selective_cleanup(hinfo, true); + kfree(hinfo->name); + vfree(hinfo); } } -- cgit v1.2.3 From 2eb51c75dcb354f8aef03d7648318b24630632e1 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Tue, 25 Feb 2020 17:57:45 +0530 Subject: net: core: devlink.c: Use built-in RCU list checking list_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled. The devlink->lock is held when devlink_dpipe_table_find() is called in non RCU read side section. Therefore, pass struct devlink to devlink_dpipe_table_find() for lockdep checking. Signed-off-by: Madhuparna Bhowmik Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 8d0b558be942..5e220809844c 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2103,11 +2103,11 @@ err_action_values_put: static struct devlink_dpipe_table * devlink_dpipe_table_find(struct list_head *dpipe_tables, - const char *table_name) + const char *table_name, struct devlink *devlink) { struct devlink_dpipe_table *table; - - list_for_each_entry_rcu(table, dpipe_tables, list) { + list_for_each_entry_rcu(table, dpipe_tables, list, + lockdep_is_held(&devlink->lock)) { if (!strcmp(table->name, table_name)) return table; } @@ -2226,7 +2226,7 @@ static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb, table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); if (!table) return -EINVAL; @@ -2382,7 +2382,7 @@ static int devlink_dpipe_table_counters_set(struct devlink *devlink, struct devlink_dpipe_table *table; table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); if (!table) return -EINVAL; @@ -6854,7 +6854,7 @@ bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, rcu_read_lock(); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); enabled = false; if (table) enabled = table->counters_enabled; @@ -6885,7 +6885,8 @@ int devlink_dpipe_table_register(struct devlink *devlink, mutex_lock(&devlink->lock); - if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name)) { + if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name, + devlink)) { err = -EEXIST; goto unlock; } @@ -6921,7 +6922,7 @@ void devlink_dpipe_table_unregister(struct devlink *devlink, mutex_lock(&devlink->lock); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); if (!table) goto unlock; list_del_rcu(&table->list); @@ -7078,7 +7079,7 @@ int devlink_dpipe_table_resource_set(struct devlink *devlink, mutex_lock(&devlink->lock); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); if (!table) { err = -EINVAL; goto out; -- cgit v1.2.3 From 1521a67e6016664941f0917d50cb20053a8826a2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 25 Feb 2020 13:54:12 +0100 Subject: sched: act: count in the size of action flags bitfield The put of the flags was added by the commit referenced in fixes tag, however the size of the message was not extended accordingly. Fix this by adding size of the flags bitfield to the message size. Fixes: e38226786022 ("net: sched: update action implementations to support flags") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 90a31b15585f..8c466a712cda 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -186,6 +186,7 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act) + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ + cookie_len /* TCA_ACT_COOKIE */ + nla_total_size(0) /* TCA_ACT_STATS nested */ + + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_FLAGS */ /* TCA_STATS_BASIC */ + nla_total_size_64bit(sizeof(struct gnet_stats_basic)) /* TCA_STATS_PKT64 */ -- cgit v1.2.3 From 51e3dfa8906ace90c809235b3d3afebc166b6433 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 25 Feb 2020 16:34:36 +0100 Subject: net/smc: fix cleanup for linkgroup setup failures If an SMC connection to a certain peer is setup the first time, a new linkgroup is created. In case of setup failures, such a linkgroup is unusable and should disappear. As a first step the linkgroup is removed from the linkgroup list in smc_lgr_forget(). There are 2 problems: smc_listen_decline() might be called before linkgroup creation resulting in a crash due to calling smc_lgr_forget() with parameter NULL. If a setup failure occurs after linkgroup creation, the connection is never unregistered from the linkgroup, preventing linkgroup freeing. This patch introduces an enhanced smc_lgr_cleanup_early() function which * contains a linkgroup check for early smc_listen_decline() invocations * invokes smc_conn_free() to guarantee unregistering of the connection. * schedules fast linkgroup removal of the unusable linkgroup And the unused function smcd_conn_free() is removed from smc_core.h. Fixes: 3b2dec2603d5b ("net/smc: restructure client and server code in af_smc") Fixes: 2a0674fffb6bc ("net/smc: improve abnormal termination of link groups") Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 25 +++++++++++++++---------- net/smc/smc_core.c | 12 ++++++++++++ net/smc/smc_core.h | 2 +- 3 files changed, 28 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 90988a511cd5..6fd44bdb0fc3 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -512,15 +512,18 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code) static int smc_connect_abort(struct smc_sock *smc, int reason_code, int local_contact) { + bool is_smcd = smc->conn.lgr->is_smcd; + if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(smc->conn.lgr); - if (smc->conn.lgr->is_smcd) + smc_lgr_cleanup_early(&smc->conn); + else + smc_conn_free(&smc->conn); + if (is_smcd) /* there is only one lgr role for SMC-D; use server lock */ mutex_unlock(&smc_server_lgr_pending); else mutex_unlock(&smc_client_lgr_pending); - smc_conn_free(&smc->conn); smc->connect_nonblock = 0; return reason_code; } @@ -1091,7 +1094,6 @@ static void smc_listen_out_err(struct smc_sock *new_smc) if (newsmcsk->sk_state == SMC_INIT) sock_put(&new_smc->sk); /* passive closing */ newsmcsk->sk_state = SMC_CLOSED; - smc_conn_free(&new_smc->conn); smc_listen_out(new_smc); } @@ -1102,12 +1104,13 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, { /* RDMA setup failed, switch back to TCP */ if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); + smc_lgr_cleanup_early(&new_smc->conn); + else + smc_conn_free(&new_smc->conn); if (reason_code < 0) { /* error, no fallback possible */ smc_listen_out_err(new_smc); return; } - smc_conn_free(&new_smc->conn); smc_switch_to_fallback(new_smc); new_smc->fallback_rsn = reason_code; if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { @@ -1170,16 +1173,18 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, new_smc->conn.lgr->vlan_id, new_smc->conn.lgr->smcd)) { if (ini->cln_first_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); - smc_conn_free(&new_smc->conn); + smc_lgr_cleanup_early(&new_smc->conn); + else + smc_conn_free(&new_smc->conn); return SMC_CLC_DECL_SMCDNOTALK; } /* Create send and receive buffers */ if (smc_buf_create(new_smc, true)) { if (ini->cln_first_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); - smc_conn_free(&new_smc->conn); + smc_lgr_cleanup_early(&new_smc->conn); + else + smc_conn_free(&new_smc->conn); return SMC_CLC_DECL_MEM; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2249de5379ee..5b085efa3bce 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -162,6 +162,18 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) conn->lgr = NULL; } +void smc_lgr_cleanup_early(struct smc_connection *conn) +{ + struct smc_link_group *lgr = conn->lgr; + + if (!lgr) + return; + + smc_conn_free(conn); + smc_lgr_forget(lgr); + smc_lgr_schedule_free_work_fast(lgr); +} + /* Send delete link, either as client to request the initiation * of the DELETE LINK sequence from server; or as server to * initiate the delete processing. See smc_llc_rx_delete_link(). diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c472e12951d1..234ae25f0025 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -296,6 +296,7 @@ struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; void smc_lgr_forget(struct smc_link_group *lgr); +void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate(struct smc_link_group *lgr, bool soft); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, @@ -316,7 +317,6 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); -void smcd_conn_free(struct smc_connection *conn); void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); -- cgit v1.2.3 From b6f6118901d1e867ac9177bbff3b00b185bd4fdc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Feb 2020 11:52:29 -0800 Subject: ipv6: restrict IPV6_ADDRFORM operation IPV6_ADDRFORM is able to transform IPv6 socket to IPv4 one. While this operation sounds illogical, we have to support it. One of the things it does for TCP socket is to switch sk->sk_prot to tcp_prot. We now have other layers playing with sk->sk_prot, so we should make sure to not interfere with them. This patch makes sure sk_prot is the default pointer for TCP IPv6 socket. syzbot reported : BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD a0113067 P4D a0113067 PUD a8771067 PMD 0 Oops: 0010 [#1] PREEMPT SMP KASAN CPU: 0 PID: 10686 Comm: syz-executor.0 Not tainted 5.6.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:0x0 Code: Bad RIP value. RSP: 0018:ffffc9000281fce0 EFLAGS: 00010246 RAX: 1ffffffff15f48ac RBX: ffffffff8afa4560 RCX: dffffc0000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8880a69a8f40 RBP: ffffc9000281fd10 R08: ffffffff86ed9b0c R09: ffffed1014d351f5 R10: ffffed1014d351f5 R11: 0000000000000000 R12: ffff8880920d3098 R13: 1ffff1101241a613 R14: ffff8880a69a8f40 R15: 0000000000000000 FS: 00007f2ae75db700(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 00000000a3b85000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: inet_release+0x165/0x1c0 net/ipv4/af_inet.c:427 __sock_release net/socket.c:605 [inline] sock_close+0xe1/0x260 net/socket.c:1283 __fput+0x2e4/0x740 fs/file_table.c:280 ____fput+0x15/0x20 fs/file_table.c:313 task_work_run+0x176/0x1b0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_usermode_loop arch/x86/entry/common.c:164 [inline] prepare_exit_to_usermode+0x480/0x5b0 arch/x86/entry/common.c:195 syscall_return_slowpath+0x113/0x4a0 arch/x86/entry/common.c:278 do_syscall_64+0x11f/0x1c0 arch/x86/entry/common.c:304 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x45c429 Code: ad b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f2ae75dac78 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: 0000000000000000 RBX: 00007f2ae75db6d4 RCX: 000000000045c429 RDX: 0000000000000001 RSI: 000000000000011a RDI: 0000000000000004 RBP: 000000000076bf20 R08: 0000000000000038 R09: 0000000000000000 R10: 0000000020000180 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000000a9d R14: 00000000004ccfb4 R15: 000000000076bf2c Modules linked in: CR2: 0000000000000000 ---[ end trace 82567b5207e87bae ]--- RIP: 0010:0x0 Code: Bad RIP value. RSP: 0018:ffffc9000281fce0 EFLAGS: 00010246 RAX: 1ffffffff15f48ac RBX: ffffffff8afa4560 RCX: dffffc0000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8880a69a8f40 RBP: ffffc9000281fd10 R08: ffffffff86ed9b0c R09: ffffed1014d351f5 R10: ffffed1014d351f5 R11: 0000000000000000 R12: ffff8880920d3098 R13: 1ffff1101241a613 R14: ffff8880a69a8f40 R15: 0000000000000000 FS: 00007f2ae75db700(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 00000000a3b85000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Eric Dumazet Reported-by: syzbot+1938db17e275e85dc328@syzkaller.appspotmail.com Cc: Daniel Borkmann Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 79fc012dd2ca..debdaeba5d8c 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -183,9 +183,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EBUSY; break; } - } else if (sk->sk_protocol != IPPROTO_TCP) + } else if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_prot != &tcpv6_prot) { + retv = -EBUSY; + break; + } break; - + } else { + break; + } if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; -- cgit v1.2.3 From dc24f8b4ecd3d6c4153a1ec1bc2006ab32a41b8d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 26 Feb 2020 12:19:03 +0100 Subject: mptcp: add dummy icsk_sync_mss() syzbot noted that the master MPTCP socket lacks the icsk_sync_mss callback, and was able to trigger a null pointer dereference: BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 8e171067 P4D 8e171067 PUD 93fa2067 PMD 0 Oops: 0010 [#1] PREEMPT SMP KASAN CPU: 0 PID: 8984 Comm: syz-executor066 Not tainted 5.6.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:0x0 Code: Bad RIP value. RSP: 0018:ffffc900020b7b80 EFLAGS: 00010246 RAX: 1ffff110124ba600 RBX: 0000000000000000 RCX: ffff88809fefa600 RDX: ffff8880994cdb18 RSI: 0000000000000000 RDI: ffff8880925d3140 RBP: ffffc900020b7bd8 R08: ffffffff870225be R09: fffffbfff140652a R10: fffffbfff140652a R11: 0000000000000000 R12: ffff8880925d35d0 R13: ffff8880925d3140 R14: dffffc0000000000 R15: 1ffff110124ba6ba FS: 0000000001a0b880(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 00000000a6d6f000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: cipso_v4_sock_setattr+0x34b/0x470 net/ipv4/cipso_ipv4.c:1888 netlbl_sock_setattr+0x2a7/0x310 net/netlabel/netlabel_kapi.c:989 smack_netlabel security/smack/smack_lsm.c:2425 [inline] smack_inode_setsecurity+0x3da/0x4a0 security/smack/smack_lsm.c:2716 security_inode_setsecurity+0xb2/0x140 security/security.c:1364 __vfs_setxattr_noperm+0x16f/0x3e0 fs/xattr.c:197 vfs_setxattr fs/xattr.c:224 [inline] setxattr+0x335/0x430 fs/xattr.c:451 __do_sys_fsetxattr fs/xattr.c:506 [inline] __se_sys_fsetxattr+0x130/0x1b0 fs/xattr.c:495 __x64_sys_fsetxattr+0xbf/0xd0 fs/xattr.c:495 do_syscall_64+0xf7/0x1c0 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x440199 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffcadc19e48 EFLAGS: 00000246 ORIG_RAX: 00000000000000be RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440199 RDX: 0000000020000200 RSI: 00000000200001c0 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 0000000000000003 R09: 00000000004002c8 R10: 0000000000000009 R11: 0000000000000246 R12: 0000000000401a20 R13: 0000000000401ab0 R14: 0000000000000000 R15: 0000000000000000 Modules linked in: CR2: 0000000000000000 Address the issue adding a dummy icsk_sync_mss callback. To properly sync the subflows mss and options list we need some additional infrastructure, which will land to net-next. Reported-by: syzbot+f4dfece964792d80b139@syzkaller.appspotmail.com Fixes: 2303f994b3e1 ("mptcp: Associate MPTCP context with TCP socket") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e9aa6807b5be..3c19a8efdcea 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -543,6 +543,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, } } +static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) +{ + return 0; +} + static int __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -551,6 +556,7 @@ static int __mptcp_init_sock(struct sock *sk) __set_bit(MPTCP_SEND_SPACE, &msk->flags); msk->first = NULL; + inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; return 0; } -- cgit v1.2.3 From a2f2ef4a54c0d97aa6a8386f4ff23f36ebb488cf Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 26 Feb 2020 17:52:46 +0100 Subject: net/smc: check for valid ib_client_data In smc_ib_remove_dev() check if the provided ib device was actually initialized for SMC before. Reported-by: syzbot+84484ccebdd4e5451d91@syzkaller.appspotmail.com Fixes: a4cf0443c414 ("smc: introduce SMC as an IB-client") Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_ib.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 548632621f4b..d6ba186f67e2 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -573,6 +573,8 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) struct smc_ib_device *smcibdev; smcibdev = ib_get_client_data(ibdev, &smc_ib_client); + if (!smcibdev || smcibdev->ibdev != ibdev) + return; ib_set_client_data(ibdev, &smc_ib_client, NULL); spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ -- cgit v1.2.3 From 3a12500ed5dd21a63da779ac73503f11085bbc1c Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 26 Feb 2020 18:29:53 +0100 Subject: unix: define and set show_fdinfo only if procfs is enabled Follow the pattern used with other *_show_fdinfo functions and only define unix_show_fdinfo and set it in proto_ops if CONFIG_PROCFS is set. Fixes: 3c32da19a858 ("unix: Show number of pending scm files of receive queue in fdinfo") Signed-off-by: Tobias Klauser Reviewed-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/unix/af_unix.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 62c12cb5763e..aa6e2530e1ec 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -682,6 +682,7 @@ static int unix_set_peek_off(struct sock *sk, int val) return 0; } +#ifdef CONFIG_PROCFS static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; @@ -692,6 +693,9 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) seq_printf(m, "scm_fds: %u\n", READ_ONCE(u->scm_stat.nr_fds)); } } +#else +#define unix_show_fdinfo NULL +#endif static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, -- cgit v1.2.3 From 5c05a164d441a1792791175e4959ea9df12f7e2b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 27 Feb 2020 11:52:35 -0800 Subject: unix: It's CONFIG_PROC_FS not CONFIG_PROCFS Fixes: 3a12500ed5dd ("unix: define and set show_fdinfo only if procfs is enabled") Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index aa6e2530e1ec..68debcb28fa4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -682,7 +682,7 @@ static int unix_set_peek_off(struct sock *sk, int val) return 0; } -#ifdef CONFIG_PROCFS +#ifdef CONFIG_PROC_FS static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; -- cgit v1.2.3 From 3f74957fcbeab703297ed0f135430414ed7e0dd0 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 26 Feb 2020 11:58:18 +0100 Subject: vsock: fix potential deadlock in transport->release() Some transports (hyperv, virtio) acquire the sock lock during the .release() callback. In the vsock_stream_connect() we call vsock_assign_transport(); if the socket was previously assigned to another transport, the vsk->transport->release() is called, but the sock lock is already held in the vsock_stream_connect(), causing a deadlock reported by syzbot: INFO: task syz-executor280:9768 blocked for more than 143 seconds. Not tainted 5.6.0-rc1-syzkaller #0 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor280 D27912 9768 9766 0x00000000 Call Trace: context_switch kernel/sched/core.c:3386 [inline] __schedule+0x934/0x1f90 kernel/sched/core.c:4082 schedule+0xdc/0x2b0 kernel/sched/core.c:4156 __lock_sock+0x165/0x290 net/core/sock.c:2413 lock_sock_nested+0xfe/0x120 net/core/sock.c:2938 virtio_transport_release+0xc4/0xd60 net/vmw_vsock/virtio_transport_common.c:832 vsock_assign_transport+0xf3/0x3b0 net/vmw_vsock/af_vsock.c:454 vsock_stream_connect+0x2b3/0xc70 net/vmw_vsock/af_vsock.c:1288 __sys_connect_file+0x161/0x1c0 net/socket.c:1857 __sys_connect+0x174/0x1b0 net/socket.c:1874 __do_sys_connect net/socket.c:1885 [inline] __se_sys_connect net/socket.c:1882 [inline] __x64_sys_connect+0x73/0xb0 net/socket.c:1882 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe To avoid this issue, this patch remove the lock acquiring in the .release() callback of hyperv and virtio transports, and it holds the lock when we call vsk->transport->release() in the vsock core. Reported-by: syzbot+731710996d79d0d58fbc@syzkaller.appspotmail.com Fixes: 408624af4c89 ("vsock: use local transport when it is loaded") Signed-off-by: Stefano Garzarella Reviewed-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 20 ++++++++++++-------- net/vmw_vsock/hyperv_transport.c | 3 --- net/vmw_vsock/virtio_transport_common.c | 2 -- 3 files changed, 12 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9c5b2a91baad..a5f28708e0e7 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -451,6 +451,12 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) if (vsk->transport == new_transport) return 0; + /* transport->release() must be called with sock lock acquired. + * This path can only be taken during vsock_stream_connect(), + * where we have already held the sock lock. + * In the other cases, this function is called on a new socket + * which is not assigned to any transport. + */ vsk->transport->release(vsk); vsock_deassign_transport(vsk); } @@ -753,20 +759,18 @@ static void __vsock_release(struct sock *sk, int level) vsk = vsock_sk(sk); pending = NULL; /* Compiler warning. */ - /* The release call is supposed to use lock_sock_nested() - * rather than lock_sock(), if a sock lock should be acquired. - */ - if (vsk->transport) - vsk->transport->release(vsk); - else if (sk->sk_type == SOCK_STREAM) - vsock_remove_sock(vsk); - /* When "level" is SINGLE_DEPTH_NESTING, use the nested * version to avoid the warning "possible recursive locking * detected". When "level" is 0, lock_sock_nested(sk, level) * is the same as lock_sock(sk). */ lock_sock_nested(sk, level); + + if (vsk->transport) + vsk->transport->release(vsk); + else if (sk->sk_type == SOCK_STREAM) + vsock_remove_sock(vsk); + sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 3492c021925f..630b851f8150 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -526,12 +526,9 @@ static bool hvs_close_lock_held(struct vsock_sock *vsk) static void hvs_release(struct vsock_sock *vsk) { - struct sock *sk = sk_vsock(vsk); bool remove_sock; - lock_sock_nested(sk, SINGLE_DEPTH_NESTING); remove_sock = hvs_close_lock_held(vsk); - release_sock(sk); if (remove_sock) vsock_remove_sock(vsk); } diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index d9f0c9c5425a..f3c4bab2f737 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -829,7 +829,6 @@ void virtio_transport_release(struct vsock_sock *vsk) struct sock *sk = &vsk->sk; bool remove_sock = true; - lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (sk->sk_type == SOCK_STREAM) remove_sock = virtio_transport_close(vsk); @@ -837,7 +836,6 @@ void virtio_transport_release(struct vsock_sock *vsk) list_del(&pkt->list); virtio_transport_free_pkt(pkt); } - release_sock(sk); if (remove_sock) vsock_remove_sock(vsk); -- cgit v1.2.3 From 84b3268027641401bb8ad4427a90a3cce2eb86f5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 26 Feb 2020 19:47:34 +0100 Subject: netlink: Use netlink header as base to calculate bad attribute offset Userspace might send a batch that is composed of several netlink messages. The netlink_ack() function must use the pointer to the netlink header as base to calculate the bad attribute offset. Fixes: 2d4bc93368f5 ("netlink: extended ACK reporting") Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index edf3e285e242..5313f1cec170 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2434,7 +2434,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, in_skb->len)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, (u8 *)extack->bad_attr - - in_skb->data)); + (u8 *)nlh)); } else { if (extack->cookie_len) WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, -- cgit v1.2.3 From 07758eb9ff52794fba15d03aa88d92dbd1b7d125 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sat, 29 Feb 2020 17:27:13 +0800 Subject: net/ipv6: use configured metric when add peer route When we add peer address with metric configured, IPv4 could set the dest metric correctly, but IPv6 do not. e.g. ]# ip addr add 192.0.2.1 peer 192.0.2.2/32 dev eth1 metric 20 ]# ip route show dev eth1 192.0.2.2 proto kernel scope link src 192.0.2.1 metric 20 ]# ip addr add 2001:db8::1 peer 2001:db8::2/128 dev eth1 metric 20 ]# ip -6 route show dev eth1 2001:db8::1 proto kernel metric 20 pref medium 2001:db8::2 proto kernel metric 256 pref medium Fix this by using configured metric instead of default one. Reported-by: Jianlin Shi Fixes: 8308f3ff1753 ("net/ipv6: Add support for specifying metric of connected routes") Reviewed-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index cb493e15959c..164c71c54b5c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5983,9 +5983,9 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); if (!ipv6_addr_any(&ifp->peer_addr)) - addrconf_prefix_route(&ifp->peer_addr, 128, 0, - ifp->idev->dev, 0, 0, - GFP_ATOMIC); + addrconf_prefix_route(&ifp->peer_addr, 128, + ifp->rt_priority, ifp->idev->dev, + 0, 0, GFP_ATOMIC); break; case RTM_DELADDR: if (ifp->idev->cnf.forwarding) -- cgit v1.2.3 From 8750939b6ad86abc3f53ec8a9683a1cded4a5654 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:11 -0800 Subject: devlink: validate length of param values DEVLINK_ATTR_PARAM_VALUE_DATA may have different types so it's not checked by the normal netlink policy. Make sure the attribute length is what we expect. Fixes: e3b7ca18ad7b ("devlink: Add param set command") Signed-off-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 5e220809844c..8e44dc5cde73 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3352,34 +3352,41 @@ devlink_param_value_get_from_info(const struct devlink_param *param, struct genl_info *info, union devlink_param_value *value) { + struct nlattr *param_data; int len; - if (param->type != DEVLINK_PARAM_TYPE_BOOL && - !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) + param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]; + + if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data) return -EINVAL; switch (param->type) { case DEVLINK_PARAM_TYPE_U8: - value->vu8 = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + if (nla_len(param_data) != sizeof(u8)) + return -EINVAL; + value->vu8 = nla_get_u8(param_data); break; case DEVLINK_PARAM_TYPE_U16: - value->vu16 = nla_get_u16(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + if (nla_len(param_data) != sizeof(u16)) + return -EINVAL; + value->vu16 = nla_get_u16(param_data); break; case DEVLINK_PARAM_TYPE_U32: - value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + if (nla_len(param_data) != sizeof(u32)) + return -EINVAL; + value->vu32 = nla_get_u32(param_data); break; case DEVLINK_PARAM_TYPE_STRING: - len = strnlen(nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]), - nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])); - if (len == nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) || + len = strnlen(nla_data(param_data), nla_len(param_data)); + if (len == nla_len(param_data) || len >= __DEVLINK_PARAM_MAX_STRING_VALUE) return -EINVAL; - strcpy(value->vstr, - nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])); + strcpy(value->vstr, nla_data(param_data)); break; case DEVLINK_PARAM_TYPE_BOOL: - value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ? - true : false; + if (param_data && nla_len(param_data)) + return -EINVAL; + value->vbool = nla_get_flag(param_data); break; } return 0; -- cgit v1.2.3 From ff3b63b8c299b73ac599b120653b47e275407656 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:12 -0800 Subject: devlink: validate length of region addr/len DEVLINK_ATTR_REGION_CHUNK_ADDR and DEVLINK_ATTR_REGION_CHUNK_LEN lack entries in the netlink policy. Corresponding nla_get_u64()s may read beyond the end of the message. Fixes: 4e54795a27f5 ("devlink: Add support for region snapshot read command") Signed-off-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 8e44dc5cde73..b831c5545d6a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -5958,6 +5958,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64 }, + [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64 }, [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, -- cgit v1.2.3 From 9322cd7c4af2ccc7fe7c5f01adb53f4f77949e92 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:14 -0800 Subject: nl802154: add missing attribute validation Add missing attribute validation for several u8 types. Fixes: 2c21d11518b6 ("net: add NL802154 interface for configuration of 802.15.4 devices") Signed-off-by: Jakub Kicinski Acked-by: Stefan Schmidt Signed-off-by: David S. Miller --- net/ieee802154/nl_policy.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c index 2c7a38d76a3a..824e7e84014c 100644 --- a/net/ieee802154/nl_policy.c +++ b/net/ieee802154/nl_policy.c @@ -21,6 +21,11 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = { [IEEE802154_ATTR_HW_ADDR] = { .type = NLA_HW_ADDR, }, [IEEE802154_ATTR_PAN_ID] = { .type = NLA_U16, }, [IEEE802154_ATTR_CHANNEL] = { .type = NLA_U8, }, + [IEEE802154_ATTR_BCN_ORD] = { .type = NLA_U8, }, + [IEEE802154_ATTR_SF_ORD] = { .type = NLA_U8, }, + [IEEE802154_ATTR_PAN_COORD] = { .type = NLA_U8, }, + [IEEE802154_ATTR_BAT_EXT] = { .type = NLA_U8, }, + [IEEE802154_ATTR_COORD_REALIGN] = { .type = NLA_U8, }, [IEEE802154_ATTR_PAGE] = { .type = NLA_U8, }, [IEEE802154_ATTR_COORD_SHORT_ADDR] = { .type = NLA_U16, }, [IEEE802154_ATTR_COORD_HW_ADDR] = { .type = NLA_HW_ADDR, }, -- cgit v1.2.3 From b60673c4c418bef7550d02faf53c34fbfeb366bf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:15 -0800 Subject: nl802154: add missing attribute validation for dev_type Add missing attribute type validation for IEEE802154_ATTR_DEV_TYPE to the netlink policy. Fixes: 90c049b2c6ae ("ieee802154: interface type to be added") Signed-off-by: Jakub Kicinski Acked-by: Stefan Schmidt Signed-off-by: David S. Miller --- net/ieee802154/nl_policy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c index 824e7e84014c..0672b2f01586 100644 --- a/net/ieee802154/nl_policy.c +++ b/net/ieee802154/nl_policy.c @@ -27,6 +27,7 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = { [IEEE802154_ATTR_BAT_EXT] = { .type = NLA_U8, }, [IEEE802154_ATTR_COORD_REALIGN] = { .type = NLA_U8, }, [IEEE802154_ATTR_PAGE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_DEV_TYPE] = { .type = NLA_U8, }, [IEEE802154_ATTR_COORD_SHORT_ADDR] = { .type = NLA_U16, }, [IEEE802154_ATTR_COORD_HW_ADDR] = { .type = NLA_HW_ADDR, }, [IEEE802154_ATTR_COORD_PAN_ID] = { .type = NLA_U16, }, -- cgit v1.2.3 From b5ab1f1be6180a2e975eede18731804b5164a05d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:18 -0800 Subject: openvswitch: add missing attribute validation for hash Add missing attribute validation for OVS_PACKET_ATTR_HASH to the netlink policy. Fixes: bd1903b7c459 ("net: openvswitch: add hash info to upcall") Signed-off-by: Jakub Kicinski Reviewed-by: Greg Rose Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index c047afd12116..07a7dd185995 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -645,6 +645,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, + [OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 }, }; static const struct genl_ops dp_packet_genl_ops[] = { -- cgit v1.2.3 From 7e6dc03eeb023e18427a373522f1d247b916a641 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:19 -0800 Subject: net: fq: add missing attribute validation for orphan mask Add missing attribute validation for TCA_FQ_ORPHAN_MASK to the netlink policy. Fixes: 06eb395fa985 ("pkt_sched: fq: better control of DDOS traffic") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index a5a295477ecc..371ad84def3b 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -744,6 +744,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, + [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 }, [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, }; -- cgit v1.2.3 From e13aaa0643da10006ec35715954e7f92a62899a5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:20 -0800 Subject: net: taprio: add missing attribute validation for txtime delay Add missing attribute validation for TCA_TAPRIO_ATTR_TXTIME_DELAY to the netlink policy. Fixes: 4cfd5779bd6e ("taprio: Add support for txtime-assist mode") Signed-off-by: Jakub Kicinski Reviewed-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 660fc45ee40f..ee717e05372b 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -768,6 +768,7 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 }, + [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, }; static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry, -- cgit v1.2.3 From 213320a67962ff6e7b83b704d55cbebc341426db Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:23 -0800 Subject: tipc: add missing attribute validation for MTU property Add missing attribute validation for TIPC_NLA_PROP_MTU to the netlink policy. Fixes: 901271e0403a ("tipc: implement configuration of UDP media MTU") Signed-off-by: Jakub Kicinski Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/tipc/netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 7c35094c20b8..bb9862410e68 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -116,6 +116,7 @@ const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 }, [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 }, [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 }, + [TIPC_NLA_PROP_MTU] = { .type = NLA_U32 }, [TIPC_NLA_PROP_BROADCAST] = { .type = NLA_U32 }, [TIPC_NLA_PROP_BROADCAST_RATIO] = { .type = NLA_U32 } }; -- cgit v1.2.3 From 361d23e41ca6e504033f7e66a03b95788377caae Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:24 -0800 Subject: nfc: add missing attribute validation for SE API Add missing attribute validation for NFC_ATTR_SE_INDEX to the netlink policy. Fixes: 5ce3f32b5264 ("NFC: netlink: SE API implementation") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/nfc/netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index eee0dddb7749..842407a48f96 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -43,6 +43,7 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED }, [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING, .len = NFC_FIRMWARE_NAME_MAXSIZE }, + [NFC_ATTR_SE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY }, [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, -- cgit v1.2.3 From 88e706d5168b07df4792dbc3d1bc37b83e4bd74d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:25 -0800 Subject: nfc: add missing attribute validation for deactivate target Add missing attribute validation for NFC_ATTR_TARGET_INDEX to the netlink policy. Fixes: 4d63adfe12dd ("NFC: Add NFC_CMD_DEACTIVATE_TARGET support") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/nfc/netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 842407a48f96..e988ca486d66 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -32,6 +32,7 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING, .len = NFC_DEVICE_NAME_MAXSIZE }, [NFC_ATTR_PROTOCOLS] = { .type = NLA_U32 }, + [NFC_ATTR_TARGET_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_COMM_MODE] = { .type = NLA_U8 }, [NFC_ATTR_RF_MODE] = { .type = NLA_U8 }, [NFC_ATTR_DEVICE_POWERED] = { .type = NLA_U8 }, -- cgit v1.2.3 From 6ba3da446551f2150fadbf8c7788edcb977683d3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:05:26 -0800 Subject: nfc: add missing attribute validation for vendor subcommand Add missing attribute validation for vendor subcommand attributes to the netlink policy. Fixes: 9e58095f9660 ("NFC: netlink: Implement vendor command support") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/nfc/netlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index e988ca486d66..e894254c17d4 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -46,6 +46,8 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { .len = NFC_FIRMWARE_NAME_MAXSIZE }, [NFC_ATTR_SE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY }, + [NFC_ATTR_VENDOR_ID] = { .type = NLA_U32 }, + [NFC_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, }; -- cgit v1.2.3 From 617940123e0140521f3080d2befc2bf55bcda094 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 3 Mar 2020 14:37:34 +0800 Subject: net/ipv6: need update peer route when modify metric When we modify the route metric, the peer address's route need also be updated. Before the fix: + ip addr add dev dummy1 2001:db8::1 peer 2001:db8::2 metric 60 + ip -6 route show dev dummy1 2001:db8::1 proto kernel metric 60 pref medium 2001:db8::2 proto kernel metric 60 pref medium + ip addr change dev dummy1 2001:db8::1 peer 2001:db8::2 metric 61 + ip -6 route show dev dummy1 2001:db8::1 proto kernel metric 61 pref medium 2001:db8::2 proto kernel metric 60 pref medium After the fix: + ip addr change dev dummy1 2001:db8::1 peer 2001:db8::2 metric 61 + ip -6 route show dev dummy1 2001:db8::1 proto kernel metric 61 pref medium 2001:db8::2 proto kernel metric 61 pref medium Fixes: 8308f3ff1753 ("net/ipv6: Add support for specifying metric of connected routes") Signed-off-by: Hangbin Liu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 164c71c54b5c..4fb72028ca45 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4586,12 +4586,14 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, } static int modify_prefix_route(struct inet6_ifaddr *ifp, - unsigned long expires, u32 flags) + unsigned long expires, u32 flags, + bool modify_peer) { struct fib6_info *f6i; u32 prio; - f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, + f6i = addrconf_get_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr, + ifp->prefix_len, ifp->idev->dev, 0, RTF_DEFAULT, true); if (!f6i) return -ENOENT; @@ -4602,7 +4604,8 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, ip6_del_rt(dev_net(ifp->idev->dev), f6i); /* add new one */ - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, + addrconf_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr, + ifp->prefix_len, ifp->rt_priority, ifp->idev->dev, expires, flags, GFP_KERNEL); } else { @@ -4678,7 +4681,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) int rc = -ENOENT; if (had_prefixroute) - rc = modify_prefix_route(ifp, expires, flags); + rc = modify_prefix_route(ifp, expires, flags, false); /* prefix route could have been deleted; if so restore it */ if (rc == -ENOENT) { @@ -4686,6 +4689,15 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) ifp->rt_priority, ifp->idev->dev, expires, flags, GFP_KERNEL); } + + if (had_prefixroute && !ipv6_addr_any(&ifp->peer_addr)) + rc = modify_prefix_route(ifp, expires, flags, true); + + if (rc == -ENOENT && !ipv6_addr_any(&ifp->peer_addr)) { + addrconf_prefix_route(&ifp->peer_addr, ifp->prefix_len, + ifp->rt_priority, ifp->idev->dev, + expires, flags, GFP_KERNEL); + } } else if (had_prefixroute) { enum cleanup_prefix_rt_t action; unsigned long rt_expires; -- cgit v1.2.3 From d0098e4c6b83e502cc1cd96d67ca86bc79a6c559 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 3 Mar 2020 14:37:35 +0800 Subject: net/ipv6: remove the old peer route if change it to a new one When we modify the peer route and changed it to a new one, we should remove the old route first. Before the fix: + ip addr add dev dummy1 2001:db8::1 peer 2001:db8::2 + ip -6 route show dev dummy1 2001:db8::1 proto kernel metric 256 pref medium 2001:db8::2 proto kernel metric 256 pref medium + ip addr change dev dummy1 2001:db8::1 peer 2001:db8::3 + ip -6 route show dev dummy1 2001:db8::1 proto kernel metric 256 pref medium 2001:db8::2 proto kernel metric 256 pref medium After the fix: + ip addr change dev dummy1 2001:db8::1 peer 2001:db8::3 + ip -6 route show dev dummy1 2001:db8::1 proto kernel metric 256 pref medium 2001:db8::3 proto kernel metric 256 pref medium This patch depend on the previous patch "net/ipv6: need update peer route when modify metric" to update new peer route after delete old one. Signed-off-by: Hangbin Liu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4fb72028ca45..e6e1290ea06f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1226,11 +1226,13 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires) } static void -cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt) +cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, + bool del_rt, bool del_peer) { struct fib6_info *f6i; - f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, + f6i = addrconf_get_prefix_route(del_peer ? &ifp->peer_addr : &ifp->addr, + ifp->prefix_len, ifp->idev->dev, 0, RTF_DEFAULT, true); if (f6i) { if (del_rt) @@ -1293,7 +1295,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) if (action != CLEANUP_PREFIX_RT_NOP) { cleanup_prefix_route(ifp, expires, - action == CLEANUP_PREFIX_RT_DEL); + action == CLEANUP_PREFIX_RT_DEL, false); } /* clean up prefsrc entries */ @@ -4627,6 +4629,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) unsigned long timeout; bool was_managetempaddr; bool had_prefixroute; + bool new_peer = false; ASSERT_RTNL(); @@ -4658,6 +4661,13 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) cfg->preferred_lft = timeout; } + if (cfg->peer_pfx && + memcmp(&ifp->peer_addr, cfg->peer_pfx, sizeof(struct in6_addr))) { + if (!ipv6_addr_any(&ifp->peer_addr)) + cleanup_prefix_route(ifp, expires, true, true); + new_peer = true; + } + spin_lock_bh(&ifp->lock); was_managetempaddr = ifp->flags & IFA_F_MANAGETEMPADDR; had_prefixroute = ifp->flags & IFA_F_PERMANENT && @@ -4673,6 +4683,9 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority) ifp->rt_priority = cfg->rt_priority; + if (new_peer) + ifp->peer_addr = *cfg->peer_pfx; + spin_unlock_bh(&ifp->lock); if (!(ifp->flags&IFA_F_TENTATIVE)) ipv6_ifa_notify(0, ifp); @@ -4708,7 +4721,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) if (action != CLEANUP_PREFIX_RT_NOP) { cleanup_prefix_route(ifp, rt_expires, - action == CLEANUP_PREFIX_RT_DEL); + action == CLEANUP_PREFIX_RT_DEL, false); } } -- cgit v1.2.3 From 8640f8dc6d657ebfb4e67c202ad32c5457858a13 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 3 Mar 2020 15:01:46 +0000 Subject: net: dsa: fix phylink_start()/phylink_stop() calls Place phylink_start()/phylink_stop() inside dsa_port_enable() and dsa_port_disable(), which ensures that we call phylink_stop() before tearing down phylink - which is a documented requirement. Failure to do so can cause use-after-free bugs. Fixes: 0e27921816ad ("net: dsa: Use PHYLINK for the CPU/DSA ports") Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 2 ++ net/dsa/port.c | 32 ++++++++++++++++++++++++++------ net/dsa/slave.c | 8 ++------ 3 files changed, 30 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index a7662e7a691d..760e6ea3178a 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -117,7 +117,9 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, /* port.c */ int dsa_port_set_state(struct dsa_port *dp, u8 state, struct switchdev_trans *trans); +int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy); int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy); +void dsa_port_disable_rt(struct dsa_port *dp); void dsa_port_disable(struct dsa_port *dp); int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); diff --git a/net/dsa/port.c b/net/dsa/port.c index 774facb8d547..ed7dabb57985 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -63,7 +63,7 @@ static void dsa_port_set_state_now(struct dsa_port *dp, u8 state) pr_err("DSA: failed to set STP state %u (%d)\n", state, err); } -int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy) +int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy) { struct dsa_switch *ds = dp->ds; int port = dp->index; @@ -78,14 +78,31 @@ int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy) if (!dp->bridge_dev) dsa_port_set_state_now(dp, BR_STATE_FORWARDING); + if (dp->pl) + phylink_start(dp->pl); + return 0; } -void dsa_port_disable(struct dsa_port *dp) +int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy) +{ + int err; + + rtnl_lock(); + err = dsa_port_enable_rt(dp, phy); + rtnl_unlock(); + + return err; +} + +void dsa_port_disable_rt(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; int port = dp->index; + if (dp->pl) + phylink_stop(dp->pl); + if (!dp->bridge_dev) dsa_port_set_state_now(dp, BR_STATE_DISABLED); @@ -93,6 +110,13 @@ void dsa_port_disable(struct dsa_port *dp) ds->ops->port_disable(ds, port); } +void dsa_port_disable(struct dsa_port *dp) +{ + rtnl_lock(); + dsa_port_disable_rt(dp); + rtnl_unlock(); +} + int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { @@ -614,10 +638,6 @@ static int dsa_port_phylink_register(struct dsa_port *dp) goto err_phy_connect; } - rtnl_lock(); - phylink_start(dp->pl); - rtnl_unlock(); - return 0; err_phy_connect: diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 088c886e609e..ddc0f9236928 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -88,12 +88,10 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } - err = dsa_port_enable(dp, dev->phydev); + err = dsa_port_enable_rt(dp, dev->phydev); if (err) goto clear_promisc; - phylink_start(dp->pl); - return 0; clear_promisc: @@ -114,9 +112,7 @@ static int dsa_slave_close(struct net_device *dev) struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); - phylink_stop(dp->pl); - - dsa_port_disable(dp); + dsa_port_disable_rt(dp); dev_mc_unsync(master, dev); dev_uc_unsync(master, dev); -- cgit v1.2.3 From dc15af8e9dbd039ebb06336597d2c491ef46ab74 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 25 Feb 2020 10:05:47 +0300 Subject: netfilter: nf_conntrack: ct_cpu_seq_next should increase position index If .next function does not change position index, following .show function will repeat output related to current position index. Cc: stable@vger.kernel.org Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code ...") Link: https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_standalone.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 410809c669e1..4912069627b6 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -411,7 +411,7 @@ static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu + 1; return per_cpu_ptr(net->ct.stat, cpu); } - + (*pos)++; return NULL; } -- cgit v1.2.3 From bb71f846a0002239f7058c84f1496648ff4a5c20 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 25 Feb 2020 10:05:59 +0300 Subject: netfilter: synproxy: synproxy_cpu_seq_next should increase position index If .next function does not change position index, following .show function will repeat output related to current position index. Cc: stable@vger.kernel.org Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code ...") Link: https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_synproxy_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index b0930d4aba22..b9cbe1e2453e 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -267,7 +267,7 @@ static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu + 1; return per_cpu_ptr(snet->stats, cpu); } - + (*pos)++; return NULL; } -- cgit v1.2.3 From db25517a550926f609c63054b12ea9ad515e1a10 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 25 Feb 2020 10:06:29 +0300 Subject: netfilter: xt_recent: recent_seq_next should increase position index If .next function does not change position index, following .show function will repeat output related to current position index. Without the patch: # dd if=/proc/net/xt_recent/SSH # original file outpt src=127.0.0.4 ttl: 0 last_seen: 6275444819 oldest_pkt: 1 6275444819 src=127.0.0.2 ttl: 0 last_seen: 6275438906 oldest_pkt: 1 6275438906 src=127.0.0.3 ttl: 0 last_seen: 6275441953 oldest_pkt: 1 6275441953 0+1 records in 0+1 records out 204 bytes copied, 6.1332e-05 s, 3.3 MB/s Read after lseek into middle of last line (offset 140 in example below) generates expected end of last line and then unexpected whole last line once again # dd if=/proc/net/xt_recent/SSH bs=140 skip=1 dd: /proc/net/xt_recent/SSH: cannot skip to specified offset 127.0.0.3 ttl: 0 last_seen: 6275441953 oldest_pkt: 1 6275441953 src=127.0.0.3 ttl: 0 last_seen: 6275441953 oldest_pkt: 1 6275441953 0+1 records in 0+1 records out 132 bytes copied, 6.2487e-05 s, 2.1 MB/s Cc: stable@vger.kernel.org Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code ...") Link: https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_recent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 0a9708004e20..225a7ab6d79a 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -492,12 +492,12 @@ static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) const struct recent_entry *e = v; const struct list_head *head = e->list.next; + (*pos)++; while (head == &t->iphash[st->bucket]) { if (++st->bucket >= ip_list_hash_size) return NULL; head = t->iphash[st->bucket].next; } - (*pos)++; return list_entry(head, struct recent_entry, list); } -- cgit v1.2.3 From ee84f19cbbe9cf7cba2958acb03163fed3ecbb0f Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 25 Feb 2020 10:07:12 +0300 Subject: netfilter: x_tables: xt_mttg_seq_next should increase position index If .next function does not change position index, following .show function will repeat output related to current position index. Without patch: # dd if=/proc/net/ip_tables_matches # original file output conntrack conntrack conntrack recent recent icmp udplite udp tcp 0+1 records in 0+1 records out 65 bytes copied, 5.4074e-05 s, 1.2 MB/s # dd if=/proc/net/ip_tables_matches bs=62 skip=1 dd: /proc/net/ip_tables_matches: cannot skip to specified offset cp <<< end of last line tcp <<< and then unexpected whole last line once again 0+1 records in 0+1 records out 7 bytes copied, 0.000102447 s, 68.3 kB/s Cc: stable@vger.kernel.org Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code ...") Link: https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index e27c6c5ba9df..cd2b034eef59 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1551,6 +1551,9 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; + if (ppos != NULL) + ++(*ppos); + switch (trav->class) { case MTTG_TRAV_INIT: trav->class = MTTG_TRAV_NFP_UNSPEC; @@ -1576,9 +1579,6 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, default: return NULL; } - - if (ppos != NULL) - ++*ppos; return trav; } -- cgit v1.2.3 From 2d285f26ecd072800a29c5b71e63437f21ef830a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 2 Mar 2020 21:58:50 +0100 Subject: netfilter: nf_tables: free flowtable hooks on hook register error If hook registration fails, the hooks allocated via nft_netdev_hook_alloc need to be freed. We can't change the goto label to 'goto 5' -- while it does fix the memleak it does cause a warning splat from the netfilter core (the hooks were not registered). Fixes: 3f0465a9ef02 ("netfilter: nf_tables: dynamically allocate hooks per net_device in flowtables") Reported-by: syzbot+a2ff6fa45162a5ed4dd3@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d1318bdf49ca..bb064aa4154b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6300,8 +6300,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, goto err4; err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); - if (err < 0) + if (err < 0) { + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } goto err4; + } err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) -- cgit v1.2.3 From c049b3450072b8e3998053490e025839fecfef31 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:08:31 -0800 Subject: netfilter: cthelper: add missing attribute validation for cthelper Add missing attribute validation for cthelper to the netlink policy. Fixes: 12f7a505331e ("netfilter: add user-space connection tracking helper infrastructure") Signed-off-by: Jakub Kicinski Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cthelper.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index de3a9596b7f1..a5f294aa8e4c 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -742,6 +742,8 @@ static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { [NFCTH_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN-1 }, [NFCTH_QUEUE_NUM] = { .type = NLA_U32, }, + [NFCTH_PRIV_DATA_LEN] = { .type = NLA_U32, }, + [NFCTH_STATUS] = { .type = NLA_U32, }, }; static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = { -- cgit v1.2.3 From 9d6effb2f1523eb84516e44213c00f2fd9e6afff Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:08:32 -0800 Subject: netfilter: nft_payload: add missing attribute validation for payload csum flags Add missing attribute validation for NFTA_PAYLOAD_CSUM_FLAGS to the netlink policy. Fixes: 1814096980bb ("netfilter: nft_payload: layer 4 checksum adjustment for pseudoheader fields") Signed-off-by: Jakub Kicinski Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 1993af3a2979..a7de3a58f553 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -129,6 +129,7 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 }, [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, [NFTA_PAYLOAD_CSUM_OFFSET] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 }, }; static int nft_payload_init(const struct nft_ctx *ctx, -- cgit v1.2.3 From 88a637719a1570705c02cacb3297af164b1714e7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:08:33 -0800 Subject: netfilter: nft_tunnel: add missing attribute validation for tunnels Add missing attribute validation for tunnel source and destination ports to the netlink policy. Fixes: af308b94a2a4 ("netfilter: nf_tables: add tunnel support") Signed-off-by: Jakub Kicinski Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tunnel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 4c3f2e24c7cb..764e88682a81 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -339,6 +339,8 @@ static const struct nla_policy nft_tunnel_key_policy[NFTA_TUNNEL_KEY_MAX + 1] = [NFTA_TUNNEL_KEY_FLAGS] = { .type = NLA_U32, }, [NFTA_TUNNEL_KEY_TOS] = { .type = NLA_U8, }, [NFTA_TUNNEL_KEY_TTL] = { .type = NLA_U8, }, + [NFTA_TUNNEL_KEY_SPORT] = { .type = NLA_U16, }, + [NFTA_TUNNEL_KEY_DPORT] = { .type = NLA_U16, }, [NFTA_TUNNEL_KEY_OPTS] = { .type = NLA_NESTED, }, }; -- cgit v1.2.3 From d78008de6103c708171baff9650a7862645d23b0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 3 Mar 2020 15:02:45 +0100 Subject: netfilter: nf_tables: dump NFTA_CHAIN_FLAGS attribute Missing NFTA_CHAIN_FLAGS netlink attribute when dumping basechain definitions. Fixes: c9626a2cbdb2 ("netfilter: nf_tables: add hardware offload support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bb064aa4154b..f9e60981bd36 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1405,6 +1405,11 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, lockdep_commit_lock_is_held(net)); if (nft_dump_stats(skb, stats)) goto nla_put_failure; + + if ((chain->flags & NFT_CHAIN_HW_OFFLOAD) && + nla_put_be32(skb, NFTA_CHAIN_FLAGS, + htonl(NFT_CHAIN_HW_OFFLOAD))) + goto nla_put_failure; } if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) -- cgit v1.2.3 From 1d305ba40eb8081ff21eeb8ca6ba5c70fd920934 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 5 Mar 2020 11:15:36 +0100 Subject: netfilter: nf_tables: fix infinite loop when expr is not available nft will loop forever if the kernel doesn't support an expression: 1. nft_expr_type_get() appends the family specific name to the module list. 2. -EAGAIN is returned to nfnetlink, nfnetlink calls abort path. 3. abort path sets ->done to true and calls request_module for the expression. 4. nfnetlink replays the batch, we end up in nft_expr_type_get() again. 5. nft_expr_type_get attempts to append family-specific name. This one already exists on the list, so we continue 6. nft_expr_type_get adds the generic expression name to the module list. -EAGAIN is returned, nfnetlink calls abort path. 7. abort path encounters the family-specific expression which has 'done' set, so it gets removed. 8. abort path requests the generic expression name, sets done to true. 9. batch is replayed. If the expression could not be loaded, then we will end up back at 1), because the family-specific name got removed and the cycle starts again. Note that userspace can SIGKILL the nft process to stop the cycle, but the desired behaviour is to return an error after the generic expr name fails to load the expression. Fixes: eb014de4fd418 ("netfilter: nf_tables: autoload modules from the abort path") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f9e60981bd36..38c680f28f15 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7388,13 +7388,8 @@ static void nf_tables_module_autoload(struct net *net) list_splice_init(&net->nft.module_list, &module_list); mutex_unlock(&net->nft.commit_mutex); list_for_each_entry_safe(req, next, &module_list, list) { - if (req->done) { - list_del(&req->list); - kfree(req); - } else { - request_module("%s", req->module); - req->done = true; - } + request_module("%s", req->module); + req->done = true; } mutex_lock(&net->nft.commit_mutex); list_splice(&module_list, &net->nft.module_list); @@ -8177,6 +8172,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) __nft_release_tables(net); mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); + WARN_ON_ONCE(!list_empty(&net->nft.module_list)); } static struct pernet_operations nf_tables_net_ops = { -- cgit v1.2.3 From a3aefbfe45751bf7b338c181b97608e276b5bb73 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 4 Mar 2020 17:24:31 +0300 Subject: net: nfc: fix bounds checking bugs on "pipe" This is similar to commit 674d9de02aa7 ("NFC: Fix possible memory corruption when handling SHDLC I-Frame commands") and commit d7ee81ad09f0 ("NFC: nci: Add some bounds checking in nci_hci_cmd_received()") which added range checks on "pipe". The "pipe" variable comes skb->data[0] in nfc_hci_msg_rx_work(). It's in the 0-255 range. We're using it as the array index into the hdev->pipes[] array which has NFC_HCI_MAX_PIPES (128) members. Fixes: 118278f20aa8 ("NFC: hci: Add pipes table to reference them with a tuple {gate, host}") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/nfc/hci/core.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 6f1b096e601c..43811b5219b5 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -181,13 +181,20 @@ exit: void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, struct sk_buff *skb) { - u8 gate = hdev->pipes[pipe].gate; u8 status = NFC_HCI_ANY_OK; struct hci_create_pipe_resp *create_info; struct hci_delete_pipe_noti *delete_info; struct hci_all_pipe_cleared_noti *cleared_info; + u8 gate; - pr_debug("from gate %x pipe %x cmd %x\n", gate, pipe, cmd); + pr_debug("from pipe %x cmd %x\n", pipe, cmd); + + if (pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + + gate = hdev->pipes[pipe].gate; switch (cmd) { case NFC_HCI_ADM_NOTIFY_PIPE_CREATED: @@ -375,8 +382,14 @@ void nfc_hci_event_received(struct nfc_hci_dev *hdev, u8 pipe, u8 event, struct sk_buff *skb) { int r = 0; - u8 gate = hdev->pipes[pipe].gate; + u8 gate; + + if (pipe >= NFC_HCI_MAX_PIPES) { + pr_err("Discarded event %x to invalid pipe %x\n", event, pipe); + goto exit; + } + gate = hdev->pipes[pipe].gate; if (gate == NFC_HCI_INVALID_GATE) { pr_err("Discarded event %x to unopened pipe %x\n", event, pipe); goto exit; -- cgit v1.2.3 From 2398e3991bda7caa6b112a6f650fbab92f732b91 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 4 Mar 2020 16:51:07 +0100 Subject: mptcp: always include dack if possible. Currently passive MPTCP socket can skip including the DACK option - if the peer sends data before accept() completes. The above happens because the msk 'can_ack' flag is set only after the accept() call. Such missing DACK option may cause - as per RFC spec - unwanted fallback to TCP. This change addresses the issue using the key material available in the current subflow, if any, to create a suitable dack option when msk ack seq is not yet available. v1 -> v2: - adavance the generated ack after the initial MPC packet Fixes: d22f4988ffec ("mptcp: process MP_CAPABLE data option") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 45acd877bef3..fd2c3150e591 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -334,6 +334,8 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, struct mptcp_sock *msk; unsigned int ack_size; bool ret = false; + bool can_ack; + u64 ack_seq; u8 tcp_fin; if (skb) { @@ -360,9 +362,22 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, ret = true; } + /* passive sockets msk will set the 'can_ack' after accept(), even + * if the first subflow may have the already the remote key handy + */ + can_ack = true; opts->ext_copy.use_ack = 0; msk = mptcp_sk(subflow->conn); - if (!msk || !READ_ONCE(msk->can_ack)) { + if (likely(msk && READ_ONCE(msk->can_ack))) { + ack_seq = msk->ack_seq; + } else if (subflow->can_ack) { + mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); + ack_seq++; + } else { + can_ack = false; + } + + if (unlikely(!can_ack)) { *size = ALIGN(dss_size, 4); return ret; } @@ -375,7 +390,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, dss_size += ack_size; - opts->ext_copy.data_ack = msk->ack_seq; + opts->ext_copy.data_ack = ack_seq; opts->ext_copy.ack64 = 1; opts->ext_copy.use_ack = 1; -- cgit v1.2.3 From 6a42cefb25d8bdc1b391f4a53c78c32164eea2dd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 6 Mar 2020 17:37:28 +0100 Subject: netfilter: nft_chain_nat: inet family is missing module ownership Set owner to THIS_MODULE, otherwise the nft_chain_nat module might be removed while there are still inet/nat chains in place. [ 117.942096] BUG: unable to handle page fault for address: ffffffffa0d5e040 [ 117.942101] #PF: supervisor read access in kernel mode [ 117.942103] #PF: error_code(0x0000) - not-present page [ 117.942106] PGD 200c067 P4D 200c067 PUD 200d063 PMD 3dc909067 PTE 0 [ 117.942113] Oops: 0000 [#1] PREEMPT SMP PTI [ 117.942118] CPU: 3 PID: 27 Comm: kworker/3:0 Not tainted 5.6.0-rc3+ #348 [ 117.942133] Workqueue: events nf_tables_trans_destroy_work [nf_tables] [ 117.942145] RIP: 0010:nf_tables_chain_destroy.isra.0+0x94/0x15a [nf_tables] [ 117.942149] Code: f6 45 54 01 0f 84 d1 00 00 00 80 3b 05 74 44 48 8b 75 e8 48 c7 c7 72 be de a0 e8 56 e6 2d e0 48 8b 45 e8 48 c7 c7 7f be de a0 <48> 8b 30 e8 43 e6 2d e0 48 8b 45 e8 48 8b 40 10 48 85 c0 74 5b 8b [ 117.942152] RSP: 0018:ffffc9000015be10 EFLAGS: 00010292 [ 117.942155] RAX: ffffffffa0d5e040 RBX: ffff88840be87fc2 RCX: 0000000000000007 [ 117.942158] RDX: 0000000000000007 RSI: 0000000000000086 RDI: ffffffffa0debe7f [ 117.942160] RBP: ffff888403b54b50 R08: 0000000000001482 R09: 0000000000000004 [ 117.942162] R10: 0000000000000000 R11: 0000000000000001 R12: ffff8883eda7e540 [ 117.942164] R13: dead000000000122 R14: dead000000000100 R15: ffff888403b3db80 [ 117.942167] FS: 0000000000000000(0000) GS:ffff88840e4c0000(0000) knlGS:0000000000000000 [ 117.942169] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 117.942172] CR2: ffffffffa0d5e040 CR3: 00000003e4c52002 CR4: 00000000001606e0 [ 117.942174] Call Trace: [ 117.942188] nf_tables_trans_destroy_work.cold+0xd/0x12 [nf_tables] [ 117.942196] process_one_work+0x1d6/0x3b0 [ 117.942200] worker_thread+0x45/0x3c0 [ 117.942203] ? process_one_work+0x3b0/0x3b0 [ 117.942210] kthread+0x112/0x130 [ 117.942214] ? kthread_create_worker_on_cpu+0x40/0x40 [ 117.942221] ret_from_fork+0x35/0x40 nf_tables_chain_destroy() crashes on module_put() because the module is gone. Fixes: d164385ec572 ("netfilter: nat: add inet family nat support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_chain_nat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c index ff9ac8ae0031..eac4a901233f 100644 --- a/net/netfilter/nft_chain_nat.c +++ b/net/netfilter/nft_chain_nat.c @@ -89,6 +89,7 @@ static const struct nft_chain_type nft_chain_nat_inet = { .name = "nat", .type = NFT_CHAIN_T_NAT, .family = NFPROTO_INET, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | -- cgit v1.2.3 From 17c25cafd4d3e74c83dce56b158843b19c40b414 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Mar 2020 22:05:14 -0800 Subject: gre: fix uninit-value in __iptunnel_pull_header syzbot found an interesting case of the kernel reading an uninit-value [1] Problem is in the handling of ETH_P_WCCP in gre_parse_header() We look at the byte following GRE options to eventually decide if the options are four bytes longer. Use skb_header_pointer() to not pull bytes if we found that no more bytes were needed. All callers of gre_parse_header() are properly using pskb_may_pull() anyway before proceeding to next header. [1] BUG: KMSAN: uninit-value in pskb_may_pull include/linux/skbuff.h:2303 [inline] BUG: KMSAN: uninit-value in __iptunnel_pull_header+0x30c/0xbd0 net/ipv4/ip_tunnel_core.c:94 CPU: 1 PID: 11784 Comm: syz-executor940 Not tainted 5.6.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x220 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:118 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215 pskb_may_pull include/linux/skbuff.h:2303 [inline] __iptunnel_pull_header+0x30c/0xbd0 net/ipv4/ip_tunnel_core.c:94 iptunnel_pull_header include/net/ip_tunnels.h:411 [inline] gre_rcv+0x15e/0x19c0 net/ipv6/ip6_gre.c:606 ip6_protocol_deliver_rcu+0x181b/0x22c0 net/ipv6/ip6_input.c:432 ip6_input_finish net/ipv6/ip6_input.c:473 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ip6_input net/ipv6/ip6_input.c:482 [inline] ip6_mc_input+0xdf2/0x1460 net/ipv6/ip6_input.c:576 dst_input include/net/dst.h:442 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:76 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ipv6_rcv+0x683/0x710 net/ipv6/ip6_input.c:306 __netif_receive_skb_one_core net/core/dev.c:5198 [inline] __netif_receive_skb net/core/dev.c:5312 [inline] netif_receive_skb_internal net/core/dev.c:5402 [inline] netif_receive_skb+0x66b/0xf20 net/core/dev.c:5461 tun_rx_batched include/linux/skbuff.h:4321 [inline] tun_get_user+0x6aef/0x6f60 drivers/net/tun.c:1997 tun_chr_write_iter+0x1f2/0x360 drivers/net/tun.c:2026 call_write_iter include/linux/fs.h:1901 [inline] new_sync_write fs/read_write.c:483 [inline] __vfs_write+0xa5a/0xca0 fs/read_write.c:496 vfs_write+0x44a/0x8f0 fs/read_write.c:558 ksys_write+0x267/0x450 fs/read_write.c:611 __do_sys_write fs/read_write.c:623 [inline] __se_sys_write fs/read_write.c:620 [inline] __ia32_sys_write+0xdb/0x120 fs/read_write.c:620 do_syscall_32_irqs_on arch/x86/entry/common.c:339 [inline] do_fast_syscall_32+0x3c7/0x6e0 arch/x86/entry/common.c:410 entry_SYSENTER_compat+0x68/0x77 arch/x86/entry/entry_64_compat.S:139 RIP: 0023:0xf7f62d99 Code: 90 e8 0b 00 00 00 f3 90 0f ae e8 eb f9 8d 74 26 00 89 3c 24 c3 90 90 90 90 90 90 90 90 90 90 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 002b:00000000fffedb2c EFLAGS: 00000217 ORIG_RAX: 0000000000000004 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000020002580 RDX: 0000000000000fca RSI: 0000000000000036 RDI: 0000000000000004 RBP: 0000000000008914 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:144 [inline] kmsan_internal_poison_shadow+0x66/0xd0 mm/kmsan/kmsan.c:127 kmsan_slab_alloc+0x8a/0xe0 mm/kmsan/kmsan_hooks.c:82 slab_alloc_node mm/slub.c:2793 [inline] __kmalloc_node_track_caller+0xb40/0x1200 mm/slub.c:4401 __kmalloc_reserve net/core/skbuff.c:142 [inline] __alloc_skb+0x2fd/0xac0 net/core/skbuff.c:210 alloc_skb include/linux/skbuff.h:1051 [inline] alloc_skb_with_frags+0x18c/0xa70 net/core/skbuff.c:5766 sock_alloc_send_pskb+0xada/0xc60 net/core/sock.c:2242 tun_alloc_skb drivers/net/tun.c:1529 [inline] tun_get_user+0x10ae/0x6f60 drivers/net/tun.c:1843 tun_chr_write_iter+0x1f2/0x360 drivers/net/tun.c:2026 call_write_iter include/linux/fs.h:1901 [inline] new_sync_write fs/read_write.c:483 [inline] __vfs_write+0xa5a/0xca0 fs/read_write.c:496 vfs_write+0x44a/0x8f0 fs/read_write.c:558 ksys_write+0x267/0x450 fs/read_write.c:611 __do_sys_write fs/read_write.c:623 [inline] __se_sys_write fs/read_write.c:620 [inline] __ia32_sys_write+0xdb/0x120 fs/read_write.c:620 do_syscall_32_irqs_on arch/x86/entry/common.c:339 [inline] do_fast_syscall_32+0x3c7/0x6e0 arch/x86/entry/common.c:410 entry_SYSENTER_compat+0x68/0x77 arch/x86/entry/entry_64_compat.S:139 Fixes: 95f5c64c3c13 ("gre: Move utility functions to common headers") Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/gre_demux.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 5fd6e8ed02b5..66fdbfe5447c 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -56,7 +56,9 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) } EXPORT_SYMBOL_GPL(gre_del_protocol); -/* Fills in tpi and returns header length to be pulled. */ +/* Fills in tpi and returns header length to be pulled. + * Note that caller must use pskb_may_pull() before pulling GRE header. + */ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err, __be16 proto, int nhs) { @@ -110,8 +112,14 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header */ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { + u8 _val, *val; + + val = skb_header_pointer(skb, nhs + hdr_len, + sizeof(_val), &_val); + if (!val) + return -EINVAL; tpi->proto = proto; - if ((*(u8 *)options & 0xF0) != 0x40) + if ((*val & 0xF0) != 0x40) hdr_len += 4; } tpi->hdr_len = hdr_len; -- cgit v1.2.3 From 83f73c5bb7b9a9135173f0ba2b1aa00c06664ff9 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Thu, 5 Mar 2020 15:33:12 +0300 Subject: inet_diag: return classid for all socket types In commit 1ec17dbd90f8 ("inet_diag: fix reporting cgroup classid and fallback to priority") croup classid reporting was fixed. But this works only for TCP sockets because for other socket types icsk parameter can be NULL and classid code path is skipped. This change moves classid handling to inet_diag_msg_attrs_fill() function. Also inet_diag_msg_attrs_size() helper was added and addends in nlmsg_new() were reordered to save order from inet_sk_diag_fill(). Fixes: 1ec17dbd90f8 ("inet_diag: fix reporting cgroup classid and fallback to priority") Signed-off-by: Dmitry Yakunin Reviewed-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 18 ++++++++++++------ net/ipv4/inet_diag.c | 44 ++++++++++++++++++++------------------------ net/ipv4/raw_diag.c | 5 +++-- net/ipv4/udp_diag.c | 5 +++-- net/sctp/diag.c | 8 ++------ 5 files changed, 40 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 39faaaf843e1..c91cf2dee12a 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -2,15 +2,10 @@ #ifndef _INET_DIAG_H_ #define _INET_DIAG_H_ 1 +#include #include -struct net; -struct sock; struct inet_hashinfo; -struct nlattr; -struct nlmsghdr; -struct sk_buff; -struct netlink_callback; struct inet_diag_handler { void (*dump)(struct sk_buff *skb, @@ -62,6 +57,17 @@ int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk); void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk); +static inline size_t inet_diag_msg_attrs_size(void) +{ + return nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + + nla_total_size(1) /* INET_DIAG_TOS */ +#if IS_ENABLED(CONFIG_IPV6) + + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(1) /* INET_DIAG_SKV6ONLY */ +#endif + + nla_total_size(4) /* INET_DIAG_MARK */ + + nla_total_size(4); /* INET_DIAG_CLASS_ID */ +} int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, struct user_namespace *user_ns, bool net_admin); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f11e997e517b..8c8377568a78 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -100,13 +100,9 @@ static size_t inet_sk_attr_size(struct sock *sk, aux = handler->idiag_get_aux_size(sk, net_admin); return nla_total_size(sizeof(struct tcp_info)) - + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ - + nla_total_size(1) /* INET_DIAG_TOS */ - + nla_total_size(1) /* INET_DIAG_TCLASS */ - + nla_total_size(4) /* INET_DIAG_MARK */ - + nla_total_size(4) /* INET_DIAG_CLASS_ID */ - + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + nla_total_size(TCP_CA_NAME_MAX) + nla_total_size(sizeof(struct tcpvegas_info)) @@ -147,6 +143,24 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark)) goto errout; + if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) || + ext & (1 << (INET_DIAG_TCLASS - 1))) { + u32 classid = 0; + +#ifdef CONFIG_SOCK_CGROUP_DATA + classid = sock_cgroup_classid(&sk->sk_cgrp_data); +#endif + /* Fallback to socket priority if class id isn't set. + * Classful qdiscs use it as direct reference to class. + * For cgroup2 classid is always zero. + */ + if (!classid) + classid = sk->sk_priority; + + if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid)) + goto errout; + } + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); @@ -284,24 +298,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto errout; } - if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) || - ext & (1 << (INET_DIAG_TCLASS - 1))) { - u32 classid = 0; - -#ifdef CONFIG_SOCK_CGROUP_DATA - classid = sock_cgroup_classid(&sk->sk_cgrp_data); -#endif - /* Fallback to socket priority if class id isn't set. - * Classful qdiscs use it as direct reference to class. - * For cgroup2 classid is always zero. - */ - if (!classid) - classid = sk->sk_priority; - - if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid)) - goto errout; - } - out: nlmsg_end(skb, nlh); return 0; diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index e35736b99300..a93e7d1e1251 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -100,8 +100,9 @@ static int raw_diag_dump_one(struct sk_buff *in_skb, if (IS_ERR(sk)) return PTR_ERR(sk); - rep = nlmsg_new(sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + 64, + rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) { sock_put(sk); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 910555a4d9fe..dccd2286bc28 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -64,8 +64,9 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, goto out; err = -ENOMEM; - rep = nlmsg_new(sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + 64, + rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) goto out; diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 8a15146faaeb..1069d7af3672 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -237,15 +237,11 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc) addrcnt++; return nla_total_size(sizeof(struct sctp_info)) - + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ - + nla_total_size(1) /* INET_DIAG_TOS */ - + nla_total_size(1) /* INET_DIAG_TCLASS */ - + nla_total_size(4) /* INET_DIAG_MARK */ - + nla_total_size(4) /* INET_DIAG_CLASS_ID */ + nla_total_size(addrlen * asoc->peer.transport_count) + nla_total_size(addrlen * addrcnt) - + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64; } -- cgit v1.2.3 From 018d26fcd12a75fb9b5fe233762aa3f2f0854b88 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Thu, 5 Mar 2020 17:45:57 +0300 Subject: cgroup, netclassid: periodically release file_lock on classid updating MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In our production environment we have faced with problem that updating classid in cgroup with heavy tasks cause long freeze of the file tables in this tasks. By heavy tasks we understand tasks with many threads and opened sockets (e.g. balancers). This freeze leads to an increase number of client timeouts. This patch implements following logic to fix this issue: аfter iterating 1000 file descriptors file table lock will be released thus providing a time gap for socket creation/deletion. Now update is non atomic and socket may be skipped using calls: dup2(oldfd, newfd); close(oldfd); But this case is not typical. Moreover before this patch skip is possible too by hiding socket fd in unix socket buffer. New sockets will be allocated with updated classid because cgroup state is updated before start of the file descriptors iteration. So in common cases this patch has no side effects. Signed-off-by: Dmitry Yakunin Reviewed-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- net/core/netclassid_cgroup.c | 47 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 0642f91c4038..b4c87fe31be2 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -53,30 +53,60 @@ static void cgrp_css_free(struct cgroup_subsys_state *css) kfree(css_cls_state(css)); } +/* + * To avoid freezing of sockets creation for tasks with big number of threads + * and opened sockets lets release file_lock every 1000 iterated descriptors. + * New sockets will already have been created with new classid. + */ + +struct update_classid_context { + u32 classid; + unsigned int batch; +}; + +#define UPDATE_CLASSID_BATCH 1000 + static int update_classid_sock(const void *v, struct file *file, unsigned n) { int err; + struct update_classid_context *ctx = (void *)v; struct socket *sock = sock_from_file(file, &err); if (sock) { spin_lock(&cgroup_sk_update_lock); - sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, - (unsigned long)v); + sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid); spin_unlock(&cgroup_sk_update_lock); } + if (--ctx->batch == 0) { + ctx->batch = UPDATE_CLASSID_BATCH; + return n + 1; + } return 0; } +static void update_classid_task(struct task_struct *p, u32 classid) +{ + struct update_classid_context ctx = { + .classid = classid, + .batch = UPDATE_CLASSID_BATCH + }; + unsigned int fd = 0; + + do { + task_lock(p); + fd = iterate_fd(p->files, fd, update_classid_sock, &ctx); + task_unlock(p); + cond_resched(); + } while (fd); +} + static void cgrp_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; struct task_struct *p; cgroup_taskset_for_each(p, css, tset) { - task_lock(p); - iterate_fd(p->files, 0, update_classid_sock, - (void *)(unsigned long)css_cls_state(css)->classid); - task_unlock(p); + update_classid_task(p, css_cls_state(css)->classid); } } @@ -98,10 +128,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, css_task_iter_start(css, 0, &it); while ((p = css_task_iter_next(&it))) { - task_lock(p); - iterate_fd(p->files, 0, update_classid_sock, - (void *)(unsigned long)cs->classid); - task_unlock(p); + update_classid_task(p, cs->classid); cond_resched(); } css_task_iter_end(&it); -- cgit v1.2.3 From d752a4986532cb6305dfd5290a614cde8072769d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 9 Mar 2020 22:16:06 -0700 Subject: net: memcg: late association of sock to memcg If a TCP socket is allocated in IRQ context or cloned from unassociated (i.e. not associated to a memcg) in IRQ context then it will remain unassociated for its whole life. Almost half of the TCPs created on the system are created in IRQ context, so, memory used by such sockets will not be accounted by the memcg. This issue is more widespread in cgroup v1 where network memory accounting is opt-in but it can happen in cgroup v2 if the source socket for the cloning was created in root memcg. To fix the issue, just do the association of the sockets at the accept() time in the process context and then force charge the memory buffer already used and reserved by the socket. Signed-off-by: Shakeel Butt Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- mm/memcontrol.c | 14 -------------- net/core/sock.c | 5 ++++- net/ipv4/inet_connection_sock.c | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1cf41ee22258..2058b8da18db 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6682,20 +6682,6 @@ void mem_cgroup_sk_alloc(struct sock *sk) if (!mem_cgroup_sockets_enabled) return; - /* - * Socket cloning can throw us here with sk_memcg already - * filled. It won't however, necessarily happen from - * process context. So the test for root memcg given - * the current task's memcg won't help us in this case. - * - * Respecting the original socket's memcg is a better - * decision in this case. - */ - if (sk->sk_memcg) { - css_get(&sk->sk_memcg->css); - return; - } - /* Do not associate the sock with unrelated interrupted task's memcg. */ if (in_interrupt()) return; diff --git a/net/core/sock.c b/net/core/sock.c index a4c8fac781ff..8f71684305c3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1830,7 +1830,10 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); - mem_cgroup_sk_alloc(newsk); + + /* sk->sk_memcg will be populated at accept() time */ + newsk->sk_memcg = NULL; + cgroup_sk_alloc(&newsk->sk_cgrp_data); rcu_read_lock(); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a4db79b1b643..65a3b2565102 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -482,6 +482,26 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) } spin_unlock_bh(&queue->fastopenq.lock); } + + if (mem_cgroup_sockets_enabled) { + int amt; + + /* atomically get the memory usage, set and charge the + * sk->sk_memcg. + */ + lock_sock(newsk); + + /* The sk has not been accepted yet, no need to look at + * sk->sk_wmem_queued. + */ + amt = sk_mem_pages(newsk->sk_forward_alloc + + atomic_read(&sk->sk_rmem_alloc)); + mem_cgroup_sk_alloc(newsk); + if (newsk->sk_memcg && amt) + mem_cgroup_charge_skmem(newsk->sk_memcg, amt); + + release_sock(newsk); + } out: release_sock(sk); if (req) -- cgit v1.2.3 From 60380488e4e0b95e9e82aa68aa9705baa86de84c Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 10 Mar 2020 15:27:37 +0800 Subject: ipv6/addrconf: call ipv6_mc_up() for non-Ethernet interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rafał found an issue that for non-Ethernet interface, if we down and up frequently, the memory will be consumed slowly. The reason is we add allnodes/allrouters addressed in multicast list in ipv6_add_dev(). When link down, we call ipv6_mc_down(), store all multicast addresses via mld_add_delrec(). But when link up, we don't call ipv6_mc_up() for non-Ethernet interface to remove the addresses. This makes idev->mc_tomb getting bigger and bigger. The call stack looks like: addrconf_notify(NETDEV_REGISTER) ipv6_add_dev ipv6_dev_mc_inc(ff01::1) ipv6_dev_mc_inc(ff02::1) ipv6_dev_mc_inc(ff02::2) addrconf_notify(NETDEV_UP) addrconf_dev_config /* Alas, we support only Ethernet autoconfiguration. */ return; addrconf_notify(NETDEV_DOWN) addrconf_ifdown ipv6_mc_down igmp6_group_dropped(ff02::2) mld_add_delrec(ff02::2) igmp6_group_dropped(ff02::1) igmp6_group_dropped(ff01::1) After investigating, I can't found a rule to disable multicast on non-Ethernet interface. In RFC2460, the link could be Ethernet, PPP, ATM, tunnels, etc. In IPv4, it doesn't check the dev type when calls ip_mc_up() in inetdev_event(). Even for IPv6, we don't check the dev type and call ipv6_add_dev(), ipv6_dev_mc_inc() after register device. So I think it's OK to fix this memory consumer by calling ipv6_mc_up() for non-Ethernet interface. v2: Also check IFF_MULTICAST flag to make sure the interface supports multicast Reported-by: Rafał Miłecki Tested-by: Rafał Miłecki Fixes: 74235a25c673 ("[IPV6] addrconf: Fix IPv6 on tuntap tunnels") Fixes: 1666d49e1d41 ("mld: do not remove mld souce list info when set link down") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e6e1290ea06f..46d614b611db 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3347,6 +3347,10 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_NONE) && (dev->type != ARPHRD_RAWIP)) { /* Alas, we support only Ethernet autoconfiguration. */ + idev = __in6_dev_get(dev); + if (!IS_ERR_OR_NULL(idev) && dev->flags & IFF_UP && + dev->flags & IFF_MULTICAST) + ipv6_mc_up(idev); return; } -- cgit v1.2.3 From ece0d7bd74615773268475b6b64d6f1ebbd4b4c6 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Tue, 10 Mar 2020 09:33:30 +0100 Subject: net/smc: cancel event worker during device removal During IB device removal, cancel the event worker before the device structure is freed. Fixes: a4cf0443c414 ("smc: introduce SMC as an IB-client") Reported-by: syzbot+b297c6825752e7a07272@syzkaller.appspotmail.com Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Reviewed-by: Leon Romanovsky Signed-off-by: David S. Miller --- net/smc/smc_ib.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index d6ba186f67e2..05b825b3cfa4 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -582,6 +582,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) smc_smcr_terminate_all(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); + cancel_work_sync(&smcibdev->port_event_work); kfree(smcibdev); } -- cgit v1.2.3 From 0e1a1d853ecedc99da9d27f9f5c376935547a0e2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:10:56 -0800 Subject: nl80211: add missing attribute validation for critical protocol indication Add missing attribute validation for critical protocol fields to the netlink policy. Fixes: 5de17984898c ("cfg80211: introduce critical protocol indication from user-space") Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20200303051058.4089398-2-kuba@kernel.org Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5b19e9fac4aa..cd0e024d7cb6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -531,6 +531,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MDID] = { .type = NLA_U16 }, [NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_CRIT_PROT_ID] = { .type = NLA_U16 }, + [NL80211_ATTR_MAX_CRIT_PROT_DURATION] = { .type = NLA_U16 }, [NL80211_ATTR_PEER_AID] = NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID), [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 }, -- cgit v1.2.3 From 056e9375e1f3c4bf2fd49b70258c7daf788ecd9d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:10:57 -0800 Subject: nl80211: add missing attribute validation for beacon report scanning Add missing attribute validation for beacon report scanning to the netlink policy. Fixes: 1d76250bd34a ("nl80211: support beacon report scanning") Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20200303051058.4089398-3-kuba@kernel.org Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index cd0e024d7cb6..48e6508aba52 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -470,6 +470,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WOWLAN_TRIGGERS] = { .type = NLA_NESTED }, [NL80211_ATTR_STA_PLINK_STATE] = NLA_POLICY_MAX(NLA_U8, NUM_NL80211_PLINK_STATES - 1), + [NL80211_ATTR_MEASUREMENT_DURATION] = { .type = NLA_U16 }, + [NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY] = { .type = NLA_FLAG }, [NL80211_ATTR_MESH_PEER_AID] = NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID), [NL80211_ATTR_SCHED_SCAN_INTERVAL] = { .type = NLA_U32 }, -- cgit v1.2.3 From 5cde05c61cbe13cbb3fa66d52b9ae84f7975e5e6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Mar 2020 21:10:58 -0800 Subject: nl80211: add missing attribute validation for channel switch Add missing attribute validation for NL80211_ATTR_OPER_CLASS to the netlink policy. Fixes: 1057d35ede5d ("cfg80211: introduce TDLS channel switch commands") Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20200303051058.4089398-4-kuba@kernel.org Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 48e6508aba52..ec5d67794aab 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -565,6 +565,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_MAX(NLA_U8, IEEE80211_NUM_UPS - 1), [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, + [NL80211_ATTR_OPER_CLASS] = { .type = NLA_U8 }, [NL80211_ATTR_MAC_MASK] = { .type = NLA_EXACT_LEN_WARN, .len = ETH_ALEN -- cgit v1.2.3 From ba32679cac50c38fdf488296f96b1f3175532b8e Mon Sep 17 00:00:00 2001 From: Nicolas Cavallari Date: Thu, 5 Mar 2020 15:04:09 +0100 Subject: mac80211: Do not send mesh HWMP PREQ if HWMP is disabled When trying to transmit to an unknown destination, the mesh code would unconditionally transmit a HWMP PREQ even if HWMP is not the current path selection algorithm. Signed-off-by: Nicolas Cavallari Link: https://lore.kernel.org/r/20200305140409.12204-1-cavallar@lri.fr Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index d69983370381..38a0383dfbcf 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1152,7 +1152,8 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, } } - if (!(mpath->flags & MESH_PATH_RESOLVING)) + if (!(mpath->flags & MESH_PATH_RESOLVING) && + mesh_path_sel_is_hwmp(sdata)) mesh_queue_preq(mpath, PREQ_Q_F_START); if (skb_queue_len(&mpath->frame_queue) >= MESH_FRAME_QUEUE_LEN) -- cgit v1.2.3 From f9fc28a8de2fb367f1ab76f0cf176ca545db3d6f Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Thu, 12 Mar 2020 11:04:20 +0530 Subject: net: caif: Add lockdep expression to RCU traversal primitive caifdevs->list is traversed using list_for_each_entry_rcu() outside an RCU read-side critical section but under the protection of rtnl_mutex. Hence, add the corresponding lockdep expression to silence the following false-positive warning: [ 10.868467] ============================= [ 10.869082] WARNING: suspicious RCU usage [ 10.869817] 5.6.0-rc1-00177-g06ec0a154aae4 #1 Not tainted [ 10.870804] ----------------------------- [ 10.871557] net/caif/caif_dev.c:115 RCU-list traversed in non-reader section!! Reported-by: kernel test robot Signed-off-by: Amol Grover Signed-off-by: David S. Miller --- net/caif/caif_dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 03c7cdd8e4cb..195d2d67be8a 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -112,7 +112,8 @@ static struct caif_device_entry *caif_get(struct net_device *dev) caif_device_list(dev_net(dev)); struct caif_device_entry *caifd; - list_for_each_entry_rcu(caifd, &caifdevs->list, list) { + list_for_each_entry_rcu(caifd, &caifdevs->list, list, + lockdep_rtnl_is_held()) { if (caifd->netdev == dev) return caifd; } -- cgit v1.2.3 From 46e4c421a053c36bf7a33dda2272481bcaf3eed3 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 9 Mar 2020 11:34:35 -0400 Subject: net/packet: tpacket_rcv: do not increment ring index on drop In one error case, tpacket_rcv drops packets after incrementing the ring producer index. If this happens, it does not update tp_status to TP_STATUS_USER and thus the reader is stalled for an iteration of the ring, causing out of order arrival. The only such error path is when virtio_net_hdr_from_skb fails due to encountering an unknown GSO type. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 30c6879d6774..e5b0986215d2 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2274,6 +2274,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, TP_STATUS_KERNEL, (macoff+snaplen)); if (!h.raw) goto drop_n_account; + + if (do_vnet && + virtio_net_hdr_from_skb(skb, h.raw + macoff - + sizeof(struct virtio_net_hdr), + vio_le(), true, 0)) + goto drop_n_account; + if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); /* @@ -2286,12 +2293,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, status |= TP_STATUS_LOSING; } - if (do_vnet && - virtio_net_hdr_from_skb(skb, h.raw + macoff - - sizeof(struct virtio_net_hdr), - vio_le(), true, 0)) - goto drop_n_account; - po->stats.stats1.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; -- cgit v1.2.3 From a20f997010c4ec76eaa55b8cc047d76dcac69f70 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 11 Mar 2020 16:24:24 +0100 Subject: net: dsa: Don't instantiate phylink for CPU/DSA ports unless needed By default, DSA drivers should configure CPU and DSA ports to their maximum speed. In many configurations this is sufficient to make the link work. In some cases it is necessary to configure the link to run slower, e.g. because of limitations of the SoC it is connected to. Or back to back PHYs are used and the PHY needs to be driven in order to establish link. In this case, phylink is used. Only instantiate phylink if it is required. If there is no PHY, or no fixed link properties, phylink can upset a link which works in the default configuration. Fixes: 0e27921816ad ("net: dsa: Use PHYLINK for the CPU/DSA ports") Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/port.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dsa/port.c b/net/dsa/port.c index ed7dabb57985..ec13dc666788 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -648,9 +648,14 @@ err_phy_connect: int dsa_port_link_register_of(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; + struct device_node *phy_np; - if (!ds->ops->adjust_link) - return dsa_port_phylink_register(dp); + if (!ds->ops->adjust_link) { + phy_np = of_parse_phandle(dp->dn, "phy-handle", 0); + if (of_phy_is_fixed_link(dp->dn) || phy_np) + return dsa_port_phylink_register(dp); + return 0; + } dev_warn(ds->dev, "Using legacy PHYLIB callbacks. Please migrate to PHYLINK!\n"); @@ -665,11 +670,12 @@ void dsa_port_link_unregister_of(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; - if (!ds->ops->adjust_link) { + if (!ds->ops->adjust_link && dp->pl) { rtnl_lock(); phylink_disconnect_phy(dp->pl); rtnl_unlock(); phylink_destroy(dp->pl); + dp->pl = NULL; return; } -- cgit v1.2.3 From 2677625387056136e256c743e3285b4fe3da87bb Mon Sep 17 00:00:00 2001 From: Paolo Lungaroni Date: Wed, 11 Mar 2020 17:54:06 +0100 Subject: seg6: fix SRv6 L2 tunnels to use IANA-assigned protocol number The Internet Assigned Numbers Authority (IANA) has recently assigned a protocol number value of 143 for Ethernet [1]. Before this assignment, encapsulation mechanisms such as Segment Routing used the IPv6-NoNxt protocol number (59) to indicate that the encapsulated payload is an Ethernet frame. In this patch, we add the definition of the Ethernet protocol number to the kernel headers and update the SRv6 L2 tunnels to use it. [1] https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml Signed-off-by: Paolo Lungaroni Reviewed-by: Andrea Mayer Acked-by: Ahmed Abdelsalam Signed-off-by: David S. Miller --- include/uapi/linux/in.h | 2 ++ net/ipv6/seg6_iptunnel.c | 2 +- net/ipv6/seg6_local.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 1521073b6348..8533bf07450f 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -74,6 +74,8 @@ enum { #define IPPROTO_UDPLITE IPPROTO_UDPLITE IPPROTO_MPLS = 137, /* MPLS in IP (RFC 4023) */ #define IPPROTO_MPLS IPPROTO_MPLS + IPPROTO_ETHERNET = 143, /* Ethernet-within-IPv6 Encapsulation */ +#define IPPROTO_ETHERNET IPPROTO_ETHERNET IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW IPPROTO_MPTCP = 262, /* Multipath TCP connection */ diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index ab7f124ff5d7..8c52efe299cc 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -268,7 +268,7 @@ static int seg6_do_srh(struct sk_buff *skb) skb_mac_header_rebuild(skb); skb_push(skb, skb->mac_len); - err = seg6_do_srh_encap(skb, tinfo->srh, NEXTHDR_NONE); + err = seg6_do_srh_encap(skb, tinfo->srh, IPPROTO_ETHERNET); if (err) return err; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 7cbc19731997..8165802d8e05 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -282,7 +282,7 @@ static int input_action_end_dx2(struct sk_buff *skb, struct net_device *odev; struct ethhdr *eth; - if (!decap_and_validate(skb, NEXTHDR_NONE)) + if (!decap_and_validate(skb, IPPROTO_ETHERNET)) goto drop; if (!pskb_may_pull(skb, ETH_HLEN)) -- cgit v1.2.3 From 06669ea346e476a5339033d77ef175566a40efbb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Mar 2020 11:44:26 -0700 Subject: net: memcg: fix lockdep splat in inet_csk_accept() Locking newsk while still holding the listener lock triggered a lockdep splat [1] We can simply move the memcg code after we release the listener lock, as this can also help if multiple threads are sharing a common listener. Also fix a typo while reading socket sk_rmem_alloc. [1] WARNING: possible recursive locking detected 5.6.0-rc3-syzkaller #0 Not tainted -------------------------------------------- syz-executor598/9524 is trying to acquire lock: ffff88808b5b8b90 (sk_lock-AF_INET6){+.+.}, at: lock_sock include/net/sock.h:1541 [inline] ffff88808b5b8b90 (sk_lock-AF_INET6){+.+.}, at: inet_csk_accept+0x69f/0xd30 net/ipv4/inet_connection_sock.c:492 but task is already holding lock: ffff88808b5b9590 (sk_lock-AF_INET6){+.+.}, at: lock_sock include/net/sock.h:1541 [inline] ffff88808b5b9590 (sk_lock-AF_INET6){+.+.}, at: inet_csk_accept+0x8d/0xd30 net/ipv4/inet_connection_sock.c:445 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(sk_lock-AF_INET6); lock(sk_lock-AF_INET6); *** DEADLOCK *** May be due to missing lock nesting notation 1 lock held by syz-executor598/9524: #0: ffff88808b5b9590 (sk_lock-AF_INET6){+.+.}, at: lock_sock include/net/sock.h:1541 [inline] #0: ffff88808b5b9590 (sk_lock-AF_INET6){+.+.}, at: inet_csk_accept+0x8d/0xd30 net/ipv4/inet_connection_sock.c:445 stack backtrace: CPU: 0 PID: 9524 Comm: syz-executor598 Not tainted 5.6.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x188/0x20d lib/dump_stack.c:118 print_deadlock_bug kernel/locking/lockdep.c:2370 [inline] check_deadlock kernel/locking/lockdep.c:2411 [inline] validate_chain kernel/locking/lockdep.c:2954 [inline] __lock_acquire.cold+0x114/0x288 kernel/locking/lockdep.c:3954 lock_acquire+0x197/0x420 kernel/locking/lockdep.c:4484 lock_sock_nested+0xc5/0x110 net/core/sock.c:2947 lock_sock include/net/sock.h:1541 [inline] inet_csk_accept+0x69f/0xd30 net/ipv4/inet_connection_sock.c:492 inet_accept+0xe9/0x7c0 net/ipv4/af_inet.c:734 __sys_accept4_file+0x3ac/0x5b0 net/socket.c:1758 __sys_accept4+0x53/0x90 net/socket.c:1809 __do_sys_accept4 net/socket.c:1821 [inline] __se_sys_accept4 net/socket.c:1818 [inline] __x64_sys_accept4+0x93/0xf0 net/socket.c:1818 do_syscall_64+0xf6/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4445c9 Code: e8 0c 0d 03 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffc35b37608 EFLAGS: 00000246 ORIG_RAX: 0000000000000120 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004445c9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000306777 R09: 0000000000306777 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00000000004053d0 R14: 0000000000000000 R15: 0000000000000000 Fixes: d752a4986532 ("net: memcg: late association of sock to memcg") Signed-off-by: Eric Dumazet Cc: Shakeel Butt Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 65a3b2565102..d545fb99a8a1 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -483,27 +483,27 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) spin_unlock_bh(&queue->fastopenq.lock); } - if (mem_cgroup_sockets_enabled) { +out: + release_sock(sk); + if (newsk && mem_cgroup_sockets_enabled) { int amt; /* atomically get the memory usage, set and charge the - * sk->sk_memcg. + * newsk->sk_memcg. */ lock_sock(newsk); - /* The sk has not been accepted yet, no need to look at - * sk->sk_wmem_queued. + /* The socket has not been accepted yet, no need to look at + * newsk->sk_wmem_queued. */ amt = sk_mem_pages(newsk->sk_forward_alloc + - atomic_read(&sk->sk_rmem_alloc)); + atomic_read(&newsk->sk_rmem_alloc)); mem_cgroup_sk_alloc(newsk); if (newsk->sk_memcg && amt) mem_cgroup_charge_skmem(newsk->sk_memcg, amt); release_sock(newsk); } -out: - release_sock(sk); if (req) reqsk_put(req); return newsk; -- cgit v1.2.3 From b09fe70ef520e011ba4a64f4b93f948a8f14717b Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 9 Mar 2020 10:39:53 -0700 Subject: taprio: Fix sending packets without dequeueing them There was a bug that was causing packets to be sent to the driver without first calling dequeue() on the "child" qdisc. And the KASAN report below shows that sending a packet without calling dequeue() leads to bad results. The problem is that when checking the last qdisc "child" we do not set the returned skb to NULL, which can cause it to be sent to the driver, and so after the skb is sent, it may be freed, and in some situations a reference to it may still be in the child qdisc, because it was never dequeued. The crash log looks like this: [ 19.937538] ================================================================== [ 19.938300] BUG: KASAN: use-after-free in taprio_dequeue_soft+0x620/0x780 [ 19.938968] Read of size 4 at addr ffff8881128628cc by task swapper/1/0 [ 19.939612] [ 19.939772] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.6.0-rc3+ #97 [ 19.940397] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qe4 [ 19.941523] Call Trace: [ 19.941774] [ 19.941985] dump_stack+0x97/0xe0 [ 19.942323] print_address_description.constprop.0+0x3b/0x60 [ 19.942884] ? taprio_dequeue_soft+0x620/0x780 [ 19.943325] ? taprio_dequeue_soft+0x620/0x780 [ 19.943767] __kasan_report.cold+0x1a/0x32 [ 19.944173] ? taprio_dequeue_soft+0x620/0x780 [ 19.944612] kasan_report+0xe/0x20 [ 19.944954] taprio_dequeue_soft+0x620/0x780 [ 19.945380] __qdisc_run+0x164/0x18d0 [ 19.945749] net_tx_action+0x2c4/0x730 [ 19.946124] __do_softirq+0x268/0x7bc [ 19.946491] irq_exit+0x17d/0x1b0 [ 19.946824] smp_apic_timer_interrupt+0xeb/0x380 [ 19.947280] apic_timer_interrupt+0xf/0x20 [ 19.947687] [ 19.947912] RIP: 0010:default_idle+0x2d/0x2d0 [ 19.948345] Code: 00 00 41 56 41 55 65 44 8b 2d 3f 8d 7c 7c 41 54 55 53 0f 1f 44 00 00 e8 b1 b2 c5 fd e9 07 00 3 [ 19.950166] RSP: 0018:ffff88811a3efda0 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13 [ 19.950909] RAX: 0000000080000000 RBX: ffff88811a3a9600 RCX: ffffffff8385327e [ 19.951608] RDX: 1ffff110234752c0 RSI: 0000000000000000 RDI: ffffffff8385262f [ 19.952309] RBP: ffffed10234752c0 R08: 0000000000000001 R09: ffffed10234752c1 [ 19.953009] R10: ffffed10234752c0 R11: ffff88811a3a9607 R12: 0000000000000001 [ 19.953709] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000000 [ 19.954408] ? default_idle_call+0x2e/0x70 [ 19.954816] ? default_idle+0x1f/0x2d0 [ 19.955192] default_idle_call+0x5e/0x70 [ 19.955584] do_idle+0x3d4/0x500 [ 19.955909] ? arch_cpu_idle_exit+0x40/0x40 [ 19.956325] ? _raw_spin_unlock_irqrestore+0x23/0x30 [ 19.956829] ? trace_hardirqs_on+0x30/0x160 [ 19.957242] cpu_startup_entry+0x19/0x20 [ 19.957633] start_secondary+0x2a6/0x380 [ 19.958026] ? set_cpu_sibling_map+0x18b0/0x18b0 [ 19.958486] secondary_startup_64+0xa4/0xb0 [ 19.958921] [ 19.959078] Allocated by task 33: [ 19.959412] save_stack+0x1b/0x80 [ 19.959747] __kasan_kmalloc.constprop.0+0xc2/0xd0 [ 19.960222] kmem_cache_alloc+0xe4/0x230 [ 19.960617] __alloc_skb+0x91/0x510 [ 19.960967] ndisc_alloc_skb+0x133/0x330 [ 19.961358] ndisc_send_ns+0x134/0x810 [ 19.961735] addrconf_dad_work+0xad5/0xf80 [ 19.962144] process_one_work+0x78e/0x13a0 [ 19.962551] worker_thread+0x8f/0xfa0 [ 19.962919] kthread+0x2ba/0x3b0 [ 19.963242] ret_from_fork+0x3a/0x50 [ 19.963596] [ 19.963753] Freed by task 33: [ 19.964055] save_stack+0x1b/0x80 [ 19.964386] __kasan_slab_free+0x12f/0x180 [ 19.964830] kmem_cache_free+0x80/0x290 [ 19.965231] ip6_mc_input+0x38a/0x4d0 [ 19.965617] ipv6_rcv+0x1a4/0x1d0 [ 19.965948] __netif_receive_skb_one_core+0xf2/0x180 [ 19.966437] netif_receive_skb+0x8c/0x3c0 [ 19.966846] br_handle_frame_finish+0x779/0x1310 [ 19.967302] br_handle_frame+0x42a/0x830 [ 19.967694] __netif_receive_skb_core+0xf0e/0x2a90 [ 19.968167] __netif_receive_skb_one_core+0x96/0x180 [ 19.968658] process_backlog+0x198/0x650 [ 19.969047] net_rx_action+0x2fa/0xaa0 [ 19.969420] __do_softirq+0x268/0x7bc [ 19.969785] [ 19.969940] The buggy address belongs to the object at ffff888112862840 [ 19.969940] which belongs to the cache skbuff_head_cache of size 224 [ 19.971202] The buggy address is located 140 bytes inside of [ 19.971202] 224-byte region [ffff888112862840, ffff888112862920) [ 19.972344] The buggy address belongs to the page: [ 19.972820] page:ffffea00044a1800 refcount:1 mapcount:0 mapping:ffff88811a2bd1c0 index:0xffff8881128625c0 compo0 [ 19.973930] flags: 0x8000000000010200(slab|head) [ 19.974388] raw: 8000000000010200 ffff88811a2ed650 ffff88811a2ed650 ffff88811a2bd1c0 [ 19.975151] raw: ffff8881128625c0 0000000000190013 00000001ffffffff 0000000000000000 [ 19.975915] page dumped because: kasan: bad access detected [ 19.976461] page_owner tracks the page as allocated [ 19.976946] page last allocated via order 2, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NO) [ 19.978332] prep_new_page+0x24b/0x330 [ 19.978707] get_page_from_freelist+0x2057/0x2c90 [ 19.979170] __alloc_pages_nodemask+0x218/0x590 [ 19.979619] new_slab+0x9d/0x300 [ 19.979948] ___slab_alloc.constprop.0+0x2f9/0x6f0 [ 19.980421] __slab_alloc.constprop.0+0x30/0x60 [ 19.980870] kmem_cache_alloc+0x201/0x230 [ 19.981269] __alloc_skb+0x91/0x510 [ 19.981620] alloc_skb_with_frags+0x78/0x4a0 [ 19.982043] sock_alloc_send_pskb+0x5eb/0x750 [ 19.982476] unix_stream_sendmsg+0x399/0x7f0 [ 19.982904] sock_sendmsg+0xe2/0x110 [ 19.983262] ____sys_sendmsg+0x4de/0x6d0 [ 19.983660] ___sys_sendmsg+0xe4/0x160 [ 19.984032] __sys_sendmsg+0xab/0x130 [ 19.984396] do_syscall_64+0xe7/0xae0 [ 19.984761] page last free stack trace: [ 19.985142] __free_pages_ok+0x432/0xbc0 [ 19.985533] qlist_free_all+0x56/0xc0 [ 19.985907] quarantine_reduce+0x149/0x170 [ 19.986315] __kasan_kmalloc.constprop.0+0x9e/0xd0 [ 19.986791] kmem_cache_alloc+0xe4/0x230 [ 19.987182] prepare_creds+0x24/0x440 [ 19.987548] do_faccessat+0x80/0x590 [ 19.987906] do_syscall_64+0xe7/0xae0 [ 19.988276] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 19.988775] [ 19.988930] Memory state around the buggy address: [ 19.989402] ffff888112862780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 19.990111] ffff888112862800: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb [ 19.990822] >ffff888112862880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 19.991529] ^ [ 19.992081] ffff888112862900: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc [ 19.992796] ffff888112862980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc Fixes: 5a781ccbd19e ("tc: Add support for configuring the taprio scheduler") Reported-by: Michael Schmidt Signed-off-by: Vinicius Costa Gomes Acked-by: Andre Guedes Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index ee717e05372b..b1eb12d33b9a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -564,8 +564,10 @@ static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch) prio = skb->priority; tc = netdev_get_prio_tc_map(dev, prio); - if (!(gate_mask & BIT(tc))) + if (!(gate_mask & BIT(tc))) { + skb = NULL; continue; + } len = qdisc_pkt_len(skb); guard = ktime_add_ns(taprio_get_time(q), @@ -575,13 +577,17 @@ static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch) * guard band ... */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && - ktime_after(guard, entry->close_time)) + ktime_after(guard, entry->close_time)) { + skb = NULL; continue; + } /* ... and no budget. */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && - atomic_sub_return(len, &entry->budget) < 0) + atomic_sub_return(len, &entry->budget) < 0) { + skb = NULL; continue; + } skb = child->ops->dequeue(child); if (unlikely(!skb)) -- cgit v1.2.3