summaryrefslogtreecommitdiffstats
path: root/net/netfilter
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-05-07 10:32:03 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-05-07 10:32:03 -0700
commitfcee7d82f27d6a8b1ddc5bbefda59b4e441e9bc0 (patch)
treefc6254372916832b89cb60f94464d41a48b2f045 /net/netfilter
parent19cbc75c56c0ed4fa3f637e3c41a98895a68dfae (diff)
parent41ae14071cd7f6a7770e2fe1f8a0859d4c2c6ba4 (diff)
downloadlinux-fcee7d82f27d6a8b1ddc5bbefda59b4e441e9bc0.tar.gz
linux-fcee7d82f27d6a8b1ddc5bbefda59b4e441e9bc0.zip
Merge tag 'net-7.1-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Pull networking fixes from Jakub Kicinski: "Including fixes from Netfilter, IPsec, Bluetooth and WiFi. Current release - fix to a fix: - ipmr: add __rcu to netns_ipv4.mrt, make sure we hold the RCU lock in all relevant places Current release - new code bugs: - fixes for the recently added resizable hash tables - ipv6: make sure we default IPv6 tunnel drivers to =m now that IPv6 itself is built in - drv: octeontx2-af: fixes for parser/CAM fixes Previous releases - regressions: - phy: micrel: fix LAN8814 QSGMII soft reset - wifi: - cw1200: revert "Fix locking in error paths" - ath12k: fix crash on WCN7850, due to adding the same queue buffer to a list multiple times Previous releases - always broken: - number of info leak fixes - ipv6: implement limits on extension header parsing - wifi: number of fixes for missing bound checks in the drivers - Bluetooth: fixes for races and locking issues - af_unix: - fix an issue between garbage collection and PEEK - fix yet another issue with OOB data - xfrm: esp: avoid in-place decrypt on shared skb frags - netfilter: replace skb_try_make_writable() by skb_ensure_writable() - openvswitch: vport: fix race between tunnel creation and linking leading to invalid memory accesses (type confusion) - drv: amd-xgbe: fix PTP addend overflow causing frozen clock Misc: - sched/isolation: make HK_TYPE_KTHREAD an alias of HK_TYPE_DOMAIN (for relevant IPVS change)" * tag 'net-7.1-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net: (190 commits) net: sparx5: configure serdes for 1000BASE-X in sparx5_port_init() net: sparx5: fix wrong chip ids for TSN SKUs net: stmmac: dwmac-nuvoton: fix NULL pointer dereference in nvt_set_phy_intf_sel() tcp: Fix dst leak in tcp_v6_connect(). ipmr: Call ipmr_fib_lookup() under RCU. net: phy: broadcom: Save PHY counters during suspend net/smc: fix missing sk_err when TCP handshake fails af_unix: Reject SIOCATMARK on non-stream sockets veth: fix OOB txq access in veth_poll() with asymmetric queue counts eth: fbnic: fix double-free of PCS on phylink creation failure net: ethernet: cortina: Drop half-assembled SKB selftests: mptcp: pm: restrict 'unknown' check to pm_nl_ctl selftests: mptcp: check output: catch cmd errors mptcp: pm: prio: skip closed subflows mptcp: pm: ADD_ADDR rtx: return early if no retrans mptcp: pm: ADD_ADDR rtx: skip inactive subflows mptcp: pm: ADD_ADDR rtx: resched blocked ADD_ADDR quicker mptcp: pm: ADD_ADDR rtx: free sk if last mptcp: pm: ADD_ADDR rtx: always decrease sk refcount mptcp: pm: ADD_ADDR rtx: fix potential data-race ...
Diffstat (limited to 'net/netfilter')
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c76
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c164
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c83
-rw-r--r--net/netfilter/nf_dup_netdev.c16
-rw-r--r--net/netfilter/nf_flow_table_core.c1
-rw-r--r--net/netfilter/nf_flow_table_ip.c151
-rw-r--r--net/netfilter/nf_flow_table_path.c7
-rw-r--r--net/netfilter/nf_tables_api.c35
-rw-r--r--net/netfilter/nf_tables_core.c2
-rw-r--r--net/netfilter/nft_compat.c45
-rw-r--r--net/netfilter/nft_exthdr.c2
-rw-r--r--net/netfilter/nft_fwd_netdev.c29
-rw-r--r--net/netfilter/nft_osf.c2
-rw-r--r--net/netfilter/nft_tproxy.c8
-rw-r--r--net/netfilter/x_tables.c79
-rw-r--r--net/netfilter/xt_CT.c8
-rw-r--r--net/netfilter/xt_TCPMSS.c33
-rw-r--r--net/netfilter/xt_TPROXY.c11
-rw-r--r--net/netfilter/xt_addrtype.c25
-rw-r--r--net/netfilter/xt_devgroup.c18
-rw-r--r--net/netfilter/xt_ecn.c4
-rw-r--r--net/netfilter/xt_hashlimit.c4
-rw-r--r--net/netfilter/xt_osf.c3
-rw-r--r--net/netfilter/xt_physdev.c20
-rw-r--r--net/netfilter/xt_policy.c24
-rw-r--r--net/netfilter/xt_set.c39
-rw-r--r--net/netfilter/xt_tcpmss.c4
28 files changed, 627 insertions, 268 deletions
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 2082bfb2d93c..9ea6b4fa78bf 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -267,27 +267,20 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
hash_key2 = hash_key;
use2 = false;
}
+
conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */,
&head, &head2);
- spin_lock(&cp->lock);
-
- if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
- cp->flags |= IP_VS_CONN_F_HASHED;
- WRITE_ONCE(cp->hn0.hash_key, hash_key);
- WRITE_ONCE(cp->hn1.hash_key, hash_key2);
- refcount_inc(&cp->refcnt);
- hlist_bl_add_head_rcu(&cp->hn0.node, head);
- if (use2)
- hlist_bl_add_head_rcu(&cp->hn1.node, head2);
- ret = 1;
- } else {
- pr_err("%s(): request for already hashed, called from %pS\n",
- __func__, __builtin_return_address(0));
- ret = 0;
- }
- spin_unlock(&cp->lock);
+ cp->flags |= IP_VS_CONN_F_HASHED;
+ WRITE_ONCE(cp->hn0.hash_key, hash_key);
+ WRITE_ONCE(cp->hn1.hash_key, hash_key2);
+ refcount_inc(&cp->refcnt);
+ hlist_bl_add_head_rcu(&cp->hn0.node, head);
+ if (use2)
+ hlist_bl_add_head_rcu(&cp->hn1.node, head2);
+
conn_tab_unlock(head, head2);
+ ret = 1;
/* Schedule resizing if load increases */
if (atomic_read(&ipvs->conn_count) > t->u_thresh &&
@@ -321,7 +314,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */,
&head, &head2);
- spin_lock(&cp->lock);
if (cp->flags & IP_VS_CONN_F_HASHED) {
/* Decrease refcnt and unlink conn only if we are last user */
@@ -334,7 +326,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
}
}
- spin_unlock(&cp->lock);
conn_tab_unlock(head, head2);
rcu_read_unlock();
@@ -637,6 +628,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
struct ip_vs_conn_hnode *hn;
u32 hash_key, hash_key_new;
struct ip_vs_conn_param p;
+ bool by_me = false;
int ntbl;
int dir;
@@ -664,8 +656,16 @@ retry:
t = rcu_dereference(t->new_tbl);
ntbl++;
/* We are lost? */
- if (ntbl >= 2)
+ if (ntbl >= 2) {
+ spin_lock_bh(&cp->lock);
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT && by_me)
+ cp->cport = 0;
+ /* hn1 will be rehashed on next packet */
+ spin_unlock_bh(&cp->lock);
+ IP_VS_ERR_RL("%s(): Too many ht changes for dir %d\n",
+ __func__, dir);
return;
+ }
}
/* Rehashing during resize? Use the recent table for adds */
@@ -683,10 +683,13 @@ retry:
if (head > head2 && t == t2)
swap(head, head2);
+ /* Protect the cp->flags modification */
+ spin_lock_bh(&cp->lock);
+
/* Lock seqcount only for the old bucket, even if we are on new table
* because it affects the del operation, not the adding.
*/
- spin_lock_bh(&t->lock[hash_key & t->lock_mask].l);
+ spin_lock(&t->lock[hash_key & t->lock_mask].l);
preempt_disable_nested();
write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]);
@@ -704,14 +707,23 @@ retry:
hlist_bl_unlock(head);
write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
preempt_enable_nested();
- spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l);
+ spin_unlock(&t->lock[hash_key & t->lock_mask].l);
+ spin_unlock_bh(&cp->lock);
hash_key = hash_key_new;
goto retry;
}
- spin_lock(&cp->lock);
- if ((cp->flags & IP_VS_CONN_F_NO_CPORT) &&
- (cp->flags & IP_VS_CONN_F_HASHED)) {
+ /* Fill cport once, even if multiple packets try to do it */
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT && (!cp->cport || by_me)) {
+ /* If we race with resizing make sure cport is set for dir 1 */
+ if (!cp->cport) {
+ cp->cport = cport;
+ by_me = true;
+ }
+ if (!dir) {
+ atomic_dec(&ipvs->no_cport_conns[af_id]);
+ cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+ }
/* We do not recalc hash_key_r under lock, we assume the
* parameters in cp do not change, i.e. cport is
* the only possible change.
@@ -726,21 +738,17 @@ retry:
hlist_bl_del_rcu(&hn->node);
hlist_bl_add_head_rcu(&hn->node, head_new);
}
- if (!dir) {
- atomic_dec(&ipvs->no_cport_conns[af_id]);
- cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
- cp->cport = cport;
- }
}
- spin_unlock(&cp->lock);
if (head != head2)
hlist_bl_unlock(head2);
hlist_bl_unlock(head);
write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
preempt_enable_nested();
- spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l);
- if (dir--)
+ spin_unlock(&t->lock[hash_key & t->lock_mask].l);
+
+ spin_unlock_bh(&cp->lock);
+ if (dir-- && by_me)
goto next_dir;
}
@@ -1835,7 +1843,7 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
if (!rcu_dereference_protected(ipvs->conn_tab, 1))
return;
- cancel_delayed_work_sync(&ipvs->conn_resize_work);
+ disable_delayed_work_sync(&ipvs->conn_resize_work);
if (!atomic_read(&ipvs->conn_count))
goto unreg;
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index f5b7a2047291..d40b404c1bf6 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -237,7 +237,7 @@ int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n,
{
if (!t)
return 1 << min_bits;
- n = roundup_pow_of_two(n);
+ n = n > 0 ? roundup_pow_of_two(n) : 1;
if (lfactor < 0) {
int factor = min(-lfactor, max_bits);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 6632daa87ded..c7c7f6a7a9f6 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -261,12 +261,28 @@ static void est_reload_work_handler(struct work_struct *work)
if (!kd)
continue;
/* New config ? Stop kthread tasks */
- if (genid != genid_done)
- ip_vs_est_kthread_stop(kd);
+ if (genid != genid_done) {
+ if (!id) {
+ /* Only we can stop kt 0 but not under mutex */
+ mutex_unlock(&ipvs->est_mutex);
+ ip_vs_est_kthread_stop(kd);
+ mutex_lock(&ipvs->est_mutex);
+ if (!READ_ONCE(ipvs->enable))
+ goto unlock;
+ /* kd for kt 0 is never destroyed */
+ } else {
+ ip_vs_est_kthread_stop(kd);
+ }
+ }
if (!kd->task && !ip_vs_est_stopped(ipvs)) {
+ bool start;
+
/* Do not start kthreads above 0 in calc phase */
- if ((!id || !ipvs->est_calc_phase) &&
- ip_vs_est_kthread_start(ipvs, kd) < 0)
+ if (id)
+ start = !ipvs->est_calc_phase;
+ else
+ start = kd->needed;
+ if (start && ip_vs_est_kthread_start(ipvs, kd) < 0)
repeat = true;
}
}
@@ -1102,6 +1118,24 @@ out:
return dest;
}
+/* Put destination in trash */
+static void ip_vs_trash_put_dest(struct netns_ipvs *ipvs,
+ struct ip_vs_dest *dest, unsigned long istart,
+ bool cleanup)
+{
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+ refcount_read(&dest->refcnt));
+ if (list_empty(&ipvs->dest_trash) && !cleanup)
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
+ /* dest lives in trash with reference */
+ list_add(&dest->t_list, &ipvs->dest_trash);
+ dest->idle_start = istart;
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+}
+
static void ip_vs_dest_rcu_free(struct rcu_head *head)
{
struct ip_vs_dest *dest;
@@ -1461,9 +1495,12 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
ntohs(dest->vport));
ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
+ /* On error put back dest into the trash */
if (ret < 0)
- return ret;
- __ip_vs_update_dest(svc, dest, udest, 1);
+ ip_vs_trash_put_dest(svc->ipvs, dest, dest->idle_start,
+ false);
+ else
+ __ip_vs_update_dest(svc, dest, udest, 1);
} else {
/*
* Allocate and initialize the dest structure
@@ -1533,17 +1570,7 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
*/
ip_vs_rs_unhash(dest);
- spin_lock_bh(&ipvs->dest_trash_lock);
- IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
- IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
- refcount_read(&dest->refcnt));
- if (list_empty(&ipvs->dest_trash) && !cleanup)
- mod_timer(&ipvs->dest_trash_timer,
- jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
- /* dest lives in trash with reference */
- list_add(&dest->t_list, &ipvs->dest_trash);
- dest->idle_start = 0;
- spin_unlock_bh(&ipvs->dest_trash_lock);
+ ip_vs_trash_put_dest(ipvs, dest, 0, cleanup);
/* Queue up delayed work to expire all no destination connections.
* No-op when CONFIG_SYSCTL is disabled.
@@ -1812,11 +1839,16 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
*svc_p = svc;
if (!READ_ONCE(ipvs->enable)) {
+ mutex_lock(&ipvs->est_mutex);
+
/* Now there is a service - full throttle */
WRITE_ONCE(ipvs->enable, 1);
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+
/* Start estimation for first time */
- ip_vs_est_reload_start(ipvs);
+ ip_vs_est_reload_start(ipvs, true);
+ mutex_unlock(&ipvs->est_mutex);
}
return 0;
@@ -2032,6 +2064,9 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
cancel_delayed_work_sync(&ipvs->svc_resize_work);
if (t) {
rcu_assign_pointer(ipvs->svc_table, NULL);
+ /* Inform readers that table is removed */
+ smp_mb__before_atomic();
+ atomic_inc(&ipvs->svc_table_changes);
while (1) {
p = rcu_dereference_protected(t->new_tbl, 1);
call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
@@ -2078,6 +2113,9 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
t = rcu_dereference_protected(ipvs->svc_table, 1);
if (t) {
rcu_assign_pointer(ipvs->svc_table, NULL);
+ /* Inform readers that table is removed */
+ smp_mb__before_atomic();
+ atomic_inc(&ipvs->svc_table_changes);
while (1) {
p = rcu_dereference_protected(t->new_tbl, 1);
call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
@@ -2086,6 +2124,11 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
t = p;
}
}
+ /* Stop the tot_stats estimator early under service_mutex
+ * to avoid locking it again later.
+ */
+ if (cleanup)
+ ip_vs_stop_estimator_tot_stats(ipvs);
return 0;
}
@@ -2331,7 +2374,7 @@ static int ipvs_proc_est_cpumask_set(const struct ctl_table *table,
/* est_max_threads may depend on cpulist size */
ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
ipvs->est_calc_phase = 1;
- ip_vs_est_reload_start(ipvs);
+ ip_vs_est_reload_start(ipvs, true);
unlock:
mutex_unlock(&ipvs->est_mutex);
@@ -2351,11 +2394,14 @@ static int ipvs_proc_est_cpumask_get(const struct ctl_table *table,
mutex_lock(&ipvs->est_mutex);
- if (ipvs->est_cpulist_valid)
- mask = *valp;
- else
- mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
- ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
+ /* HK_TYPE_KTHREAD cpumask needs RCU protection */
+ scoped_guard(rcu) {
+ if (ipvs->est_cpulist_valid)
+ mask = *valp;
+ else
+ mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
+ ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
+ }
mutex_unlock(&ipvs->est_mutex);
@@ -2411,7 +2457,7 @@ static int ipvs_proc_est_nice(const struct ctl_table *table, int write,
mutex_lock(&ipvs->est_mutex);
if (*valp != val) {
*valp = val;
- ip_vs_est_reload_start(ipvs);
+ ip_vs_est_reload_start(ipvs, true);
}
mutex_unlock(&ipvs->est_mutex);
}
@@ -2438,7 +2484,7 @@ static int ipvs_proc_run_estimation(const struct ctl_table *table, int write,
mutex_lock(&ipvs->est_mutex);
if (*valp != val) {
*valp = val;
- ip_vs_est_reload_start(ipvs);
+ ip_vs_est_reload_start(ipvs, true);
}
mutex_unlock(&ipvs->est_mutex);
}
@@ -2463,7 +2509,7 @@ static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write,
if (val < -8 || val > 8) {
ret = -EINVAL;
} else {
- *valp = val;
+ WRITE_ONCE(*valp, val);
if (rcu_access_pointer(ipvs->conn_tab))
mod_delayed_work(system_unbound_wq,
&ipvs->conn_resize_work, 0);
@@ -2490,10 +2536,16 @@ static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write,
if (val < -8 || val > 8) {
ret = -EINVAL;
} else {
- *valp = val;
- if (rcu_access_pointer(ipvs->svc_table))
+ mutex_lock(&ipvs->service_mutex);
+ WRITE_ONCE(*valp, val);
+ /* Make sure the services are present */
+ if (rcu_access_pointer(ipvs->svc_table) &&
+ READ_ONCE(ipvs->enable) &&
+ !test_bit(IP_VS_WORK_SVC_NORESIZE,
+ &ipvs->work_flags))
mod_delayed_work(system_unbound_wq,
&ipvs->svc_resize_work, 0);
+ mutex_unlock(&ipvs->service_mutex);
}
}
return ret;
@@ -3004,7 +3056,8 @@ static int ip_vs_status_show(struct seq_file *seq, void *v)
int old_gen, new_gen;
u32 counts[8];
u32 bucket;
- int count;
+ u32 count;
+ int loops;
u32 sum1;
u32 sum;
int i;
@@ -3020,6 +3073,7 @@ static int ip_vs_status_show(struct seq_file *seq, void *v)
if (!atomic_read(&ipvs->conn_count))
goto after_conns;
old_gen = atomic_read(&ipvs->conn_tab_changes);
+ loops = 0;
repeat_conn:
smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */
@@ -3032,8 +3086,11 @@ repeat_conn:
resched_score++;
ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
count = 0;
- hlist_bl_for_each_entry_rcu(hn, e, head, node)
+ hlist_bl_for_each_entry_rcu(hn, e, head, node) {
count++;
+ if (count >= ARRAY_SIZE(counts) - 1)
+ break;
+ }
}
resched_score += count;
if (resched_score >= 100) {
@@ -3042,37 +3099,41 @@ repeat_conn:
new_gen = atomic_read(&ipvs->conn_tab_changes);
/* New table installed ? */
if (old_gen != new_gen) {
+ /* Too many changes? */
+ if (++loops >= 5)
+ goto after_conns;
old_gen = new_gen;
goto repeat_conn;
}
}
- counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++;
+ counts[count]++;
}
}
for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
sum += counts[i];
sum1 = sum - counts[0];
- seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n",
- counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U));
+ seq_printf(seq, "Conn buckets empty:\t%u (%llu%%)\n",
+ counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U)));
for (i = 1; i < ARRAY_SIZE(counts); i++) {
if (!counts[i])
continue;
- seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n",
+ seq_printf(seq, "Conn buckets len-%d:\t%u (%llu%%)\n",
i, counts[i],
- (unsigned long)counts[i] * 100 / max(sum1, 1U));
+ div_u64((u64)counts[i] * 100U, max(sum1, 1U)));
}
after_conns:
t = rcu_dereference(ipvs->svc_table);
count = ip_vs_get_num_services(ipvs);
- seq_printf(seq, "Services:\t%d\n", count);
+ seq_printf(seq, "Services:\t%u\n", count);
seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n",
t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0);
if (!count)
goto after_svc;
old_gen = atomic_read(&ipvs->svc_table_changes);
+ loops = 0;
repeat_svc:
smp_rmb(); /* ipvs->svc_table and svc_table_changes */
@@ -3086,8 +3147,11 @@ repeat_svc:
ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
count = 0;
hlist_bl_for_each_entry_rcu(svc, e, head,
- s_list)
+ s_list) {
count++;
+ if (count >= ARRAY_SIZE(counts) - 1)
+ break;
+ }
}
resched_score += count;
if (resched_score >= 100) {
@@ -3096,24 +3160,27 @@ repeat_svc:
new_gen = atomic_read(&ipvs->svc_table_changes);
/* New table installed ? */
if (old_gen != new_gen) {
+ /* Too many changes? */
+ if (++loops >= 5)
+ goto after_svc;
old_gen = new_gen;
goto repeat_svc;
}
}
- counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++;
+ counts[count]++;
}
}
for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
sum += counts[i];
sum1 = sum - counts[0];
- seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n",
- counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U));
+ seq_printf(seq, "Service buckets empty:\t%u (%llu%%)\n",
+ counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U)));
for (i = 1; i < ARRAY_SIZE(counts); i++) {
if (!counts[i])
continue;
- seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n",
+ seq_printf(seq, "Service buckets len-%d:\t%u (%llu%%)\n",
i, counts[i],
- (unsigned long)counts[i] * 100 / max(sum1, 1U));
+ div_u64((u64)counts[i] * 100U, max(sum1, 1U)));
}
after_svc:
@@ -4967,7 +5034,14 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
cancel_delayed_work_sync(&ipvs->defense_work);
cancel_work_sync(&ipvs->defense_work.work);
unregister_net_sysctl_table(ipvs->sysctl_hdr);
- ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
+ if (ipvs->tot_stats->s.est.ktid != -2) {
+ /* Not stopped yet? This happens only on netns init error and
+ * we even do not need to lock the service_mutex for this case.
+ */
+ mutex_lock(&ipvs->service_mutex);
+ ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
+ mutex_unlock(&ipvs->service_mutex);
+ }
if (ipvs->est_cpulist_valid)
free_cpumask_var(ipvs->sysctl_est_cpulist);
@@ -5039,7 +5113,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
ipvs->net->proc_net,
ip_vs_stats_percpu_show, NULL))
goto err_percpu;
- if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net,
+ if (!proc_create_net_single("ip_vs_status", 0440, ipvs->net->proc_net,
ip_vs_status_show, NULL))
goto err_status;
#endif
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 433ba3cab58c..ab09f5182951 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -68,6 +68,11 @@
and the limit of estimators per kthread
- est_add_ktid: ktid where to add new ests, can point to empty slot where
we should add kt data
+ - data protected by service_mutex: est_temp_list, est_add_ktid,
+ est_kt_count(R/W), est_kt_arr(R/W), est_genid_done, kd->needed(R/W)
+ - data protected by est_mutex: est_genid, est_max_threads, sysctl_est_cpulist,
+ est_cpulist_valid, sysctl_est_nice, est_stopped, sysctl_run_estimation,
+ est_kt_count(R), est_kt_arr(R), kd->needed(R), kd->task (id > 0)
*/
static struct lock_class_key __ipvs_est_key;
@@ -227,14 +232,17 @@ static int ip_vs_estimation_kthread(void *data)
}
/* Schedule stop/start for kthread tasks */
-void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
+void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart)
{
+ lockdep_assert_held(&ipvs->est_mutex);
+
/* Ignore reloads before first service is added */
if (!READ_ONCE(ipvs->enable))
return;
ip_vs_est_stopped_recalc(ipvs);
- /* Bump the kthread configuration genid */
- atomic_inc(&ipvs->est_genid);
+ /* Bump the kthread configuration genid if stopping is requested */
+ if (restart)
+ atomic_inc(&ipvs->est_genid);
queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
}
@@ -304,12 +312,17 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
void *arr = NULL;
int i;
- if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
- READ_ONCE(ipvs->enable) && ipvs->est_max_threads)
- return -EINVAL;
-
mutex_lock(&ipvs->est_mutex);
+ /* Allow kt 0 data to be created before the services are added
+ * and limit the kthreads when services are present.
+ */
+ if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
+ READ_ONCE(ipvs->enable) && ipvs->est_max_threads) {
+ ret = -EINVAL;
+ goto out;
+ }
+
for (i = 0; i < id; i++) {
if (!ipvs->est_kt_arr[i])
break;
@@ -333,6 +346,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
kd->est_timer = jiffies;
kd->id = id;
ip_vs_est_set_params(ipvs, kd);
+ kd->needed = 1;
/* Pre-allocate stats used in calc phase */
if (!id && !kd->calc_stats) {
@@ -341,12 +355,8 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
goto out;
}
- /* Start kthread tasks only when services are present */
- if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) {
- ret = ip_vs_est_kthread_start(ipvs, kd);
- if (ret < 0)
- goto out;
- }
+ /* Request kthread to be started */
+ ip_vs_est_reload_start(ipvs, false);
if (arr)
ipvs->est_kt_count++;
@@ -482,12 +492,11 @@ out:
/* Start estimation for stats */
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
+ struct ip_vs_est_kt_data *kd = ipvs->est_kt_count > 0 ?
+ ipvs->est_kt_arr[0] : NULL;
struct ip_vs_estimator *est = &stats->est;
int ret;
- if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable))
- ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
-
est->ktid = -1;
est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */
@@ -496,8 +505,15 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
* will not allocate much memory, just for kt 0.
*/
ret = 0;
- if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
+ if (!kd) {
ret = ip_vs_est_add_kthread(ipvs);
+ } else if (!kd->needed) {
+ mutex_lock(&ipvs->est_mutex);
+ /* We have job for the kt 0 task */
+ kd->needed = 1;
+ ip_vs_est_reload_start(ipvs, true);
+ mutex_unlock(&ipvs->est_mutex);
+ }
if (ret >= 0)
hlist_add_head(&est->list, &ipvs->est_temp_list);
else
@@ -578,16 +594,14 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
}
end_kt0:
- /* kt 0 is freed after all other kthreads and chains are empty */
+ /* kt 0 task is stopped after all other kt slots and chains are empty */
if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
kd = ipvs->est_kt_arr[0];
- if (!kd || !kd->est_count) {
+ if (kd && !kd->est_count) {
mutex_lock(&ipvs->est_mutex);
- if (kd) {
- ip_vs_est_kthread_destroy(kd);
- ipvs->est_kt_arr[0] = NULL;
- }
- ipvs->est_kt_count--;
+ /* Keep the kt0 data but request kthread_stop */
+ kd->needed = 0;
+ ip_vs_est_reload_start(ipvs, true);
mutex_unlock(&ipvs->est_mutex);
ipvs->est_add_ktid = 0;
}
@@ -647,9 +661,9 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
u64 val;
INIT_HLIST_HEAD(&chain);
- mutex_lock(&ipvs->service_mutex);
+ mutex_lock(&ipvs->est_mutex);
kd = ipvs->est_kt_arr[0];
- mutex_unlock(&ipvs->service_mutex);
+ mutex_unlock(&ipvs->est_mutex);
s = kd ? kd->calc_stats : NULL;
if (!s)
goto out;
@@ -748,16 +762,16 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
if (!ip_vs_est_calc_limits(ipvs, &chain_max))
return;
- mutex_lock(&ipvs->service_mutex);
-
/* Stop all other tasks, so that we can immediately move the
* estimators to est_temp_list without RCU grace period
*/
mutex_lock(&ipvs->est_mutex);
for (id = 1; id < ipvs->est_kt_count; id++) {
/* netns clean up started, abort */
- if (!READ_ONCE(ipvs->enable))
- goto unlock2;
+ if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) {
+ mutex_unlock(&ipvs->est_mutex);
+ return;
+ }
kd = ipvs->est_kt_arr[id];
if (!kd)
continue;
@@ -765,9 +779,11 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
}
mutex_unlock(&ipvs->est_mutex);
+ mutex_lock(&ipvs->service_mutex);
+
/* Move all estimators to est_temp_list but carefully,
* all estimators and kthread data can be released while
- * we reschedule. Even for kthread 0.
+ * we reschedule.
*/
step = 0;
@@ -849,9 +865,7 @@ walk_chain:
ip_vs_stop_estimator(ipvs, stats);
/* Tasks are stopped, move without RCU grace period */
est->ktid = -1;
- est->ktrow = row - kd->est_row;
- if (est->ktrow < 0)
- est->ktrow += IPVS_EST_NTICKS;
+ est->ktrow = delay;
hlist_add_head(&est->list, &ipvs->est_temp_list);
/* kd freed ? */
if (last)
@@ -889,7 +903,6 @@ end_dequeue:
if (genid == atomic_read(&ipvs->est_genid))
ipvs->est_calc_phase = 0;
-unlock2:
mutex_unlock(&ipvs->est_mutex);
unlock:
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index e348fb90b8dc..3b0a70e154cd 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -13,22 +13,6 @@
#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_dup_netdev.h>
-#define NF_RECURSION_LIMIT 2
-
-#ifndef CONFIG_PREEMPT_RT
-static u8 *nf_get_nf_dup_skb_recursion(void)
-{
- return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
-}
-#else
-
-static u8 *nf_get_nf_dup_skb_recursion(void)
-{
- return &current->net_xmit.nf_dup_skb_recursion;
-}
-
-#endif
-
static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
enum nf_dev_hooks hook)
{
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 2c4140e6f53c..785d8c244a77 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -122,6 +122,7 @@ static int flow_offload_fill_route(struct flow_offload *flow,
flow_tuple->tun = route->tuple[dir].in.tun;
flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
+ flow_tuple->needs_gso_segment = route->tuple[dir].out.needs_gso_segment;
flow_tuple->tun_num = route->tuple[dir].in.num_tuns;
switch (route->tuple[dir].xmit_type) {
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index fd56d663cb5b..9c05a50d6013 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -445,13 +445,13 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx,
switch (skb->protocol) {
case htons(ETH_P_8021Q):
vlan_hdr = (struct vlan_hdr *)skb->data;
- __skb_pull(skb, VLAN_HLEN);
+ skb_pull_rcsum(skb, VLAN_HLEN);
vlan_set_encap_proto(skb, vlan_hdr);
skb_reset_network_header(skb);
break;
case htons(ETH_P_PPP_SES):
skb->protocol = __nf_flow_pppoe_proto(skb);
- skb_pull(skb, PPPOE_SES_HLEN);
+ skb_pull_rcsum(skb, PPPOE_SES_HLEN);
skb_reset_network_header(skb);
break;
}
@@ -462,23 +462,6 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx,
nf_flow_ip_tunnel_pop(ctx, skb);
}
-struct nf_flow_xmit {
- const void *dest;
- const void *source;
- struct net_device *outdev;
-};
-
-static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
- struct nf_flow_xmit *xmit)
-{
- skb->dev = xmit->outdev;
- dev_hard_header(skb, skb->dev, ntohs(skb->protocol),
- xmit->dest, xmit->source, skb->len);
- dev_queue_xmit(skb);
-
- return NF_STOLEN;
-}
-
static struct flow_offload_tuple_rhash *
nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx,
struct nf_flowtable *flow_table, struct sk_buff *skb)
@@ -524,7 +507,7 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
return 0;
}
- if (skb_try_make_writable(skb, thoff + ctx->hdrsize))
+ if (skb_ensure_writable(skb, thoff + ctx->hdrsize))
return -1;
flow_offload_refresh(flow_table, flow, false);
@@ -544,7 +527,34 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
return 1;
}
-static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id)
+/* Similar to skb_vlan_push. */
+static int nf_flow_vlan_push(struct sk_buff *skb, __be16 proto, u16 id,
+ u32 needed_headroom)
+{
+ if (skb_vlan_tag_present(skb)) {
+ struct vlan_hdr *vhdr;
+
+ if (skb_cow_head(skb, needed_headroom + VLAN_HLEN))
+ return -1;
+
+ __skb_push(skb, VLAN_HLEN);
+ if (skb_mac_header_was_set(skb))
+ skb->mac_header -= VLAN_HLEN;
+
+ vhdr = (struct vlan_hdr *)skb->data;
+ skb->network_header -= VLAN_HLEN;
+ vhdr->h_vlan_TCI = htons(skb_vlan_tag_get(skb));
+ vhdr->h_vlan_encapsulated_proto = skb->protocol;
+ skb->protocol = skb->vlan_proto;
+ skb_postpush_rcsum(skb, skb->data, VLAN_HLEN);
+ }
+ __vlan_hwaccel_put_tag(skb, proto, id);
+
+ return 0;
+}
+
+static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id,
+ u32 needed_headroom)
{
int data_len = skb->len + sizeof(__be16);
struct ppp_hdr {
@@ -553,7 +563,7 @@ static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id)
} *ph;
__be16 proto;
- if (skb_cow_head(skb, PPPOE_SES_HLEN))
+ if (skb_cow_head(skb, needed_headroom + PPPOE_SES_HLEN))
return -1;
switch (skb->protocol) {
@@ -730,21 +740,24 @@ static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb,
}
static int nf_flow_encap_push(struct sk_buff *skb,
- struct flow_offload_tuple *tuple)
+ struct flow_offload_tuple *tuple,
+ struct net_device *outdev)
{
+ u32 needed_headroom = LL_RESERVED_SPACE(outdev);
int i;
- for (i = 0; i < tuple->encap_num; i++) {
+ for (i = tuple->encap_num - 1; i >= 0; i--) {
switch (tuple->encap[i].proto) {
case htons(ETH_P_8021Q):
case htons(ETH_P_8021AD):
- skb_reset_mac_header(skb);
- if (skb_vlan_push(skb, tuple->encap[i].proto,
- tuple->encap[i].id) < 0)
+ if (nf_flow_vlan_push(skb, tuple->encap[i].proto,
+ tuple->encap[i].id,
+ needed_headroom) < 0)
return -1;
break;
case htons(ETH_P_PPP_SES):
- if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0)
+ if (nf_flow_pppoe_push(skb, tuple->encap[i].id,
+ needed_headroom) < 0)
return -1;
break;
}
@@ -753,6 +766,76 @@ static int nf_flow_encap_push(struct sk_buff *skb,
return 0;
}
+struct nf_flow_xmit {
+ const void *dest;
+ const void *source;
+ struct net_device *outdev;
+ struct flow_offload_tuple *tuple;
+ bool needs_gso_segment;
+};
+
+static void __nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
+ struct nf_flow_xmit *xmit)
+{
+ struct net_device *dev = xmit->outdev;
+ unsigned int hh_len = LL_RESERVED_SPACE(dev);
+
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
+ return;
+ }
+
+ skb->dev = dev;
+ dev_hard_header(skb, dev, ntohs(skb->protocol),
+ xmit->dest, xmit->source, skb->len);
+ dev_queue_xmit(skb);
+}
+
+static unsigned int nf_flow_encap_gso_xmit(struct net *net, struct sk_buff *skb,
+ struct nf_flow_xmit *xmit)
+{
+ struct sk_buff *segs, *nskb;
+
+ segs = skb_gso_segment(skb, 0);
+ if (IS_ERR(segs))
+ return NF_DROP;
+
+ if (segs)
+ consume_skb(skb);
+ else
+ segs = skb;
+
+ skb_list_walk_safe(segs, segs, nskb) {
+ skb_mark_not_on_list(segs);
+
+ if (nf_flow_encap_push(segs, xmit->tuple, xmit->outdev) < 0) {
+ kfree_skb(segs);
+ kfree_skb_list(nskb);
+ return NF_STOLEN;
+ }
+ __nf_flow_queue_xmit(net, segs, xmit);
+ }
+
+ return NF_STOLEN;
+}
+
+static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
+ struct nf_flow_xmit *xmit)
+{
+ if (xmit->tuple->encap_num) {
+ if (skb_is_gso(skb) && xmit->needs_gso_segment)
+ return nf_flow_encap_gso_xmit(net, skb, xmit);
+
+ if (nf_flow_encap_push(skb, xmit->tuple, xmit->outdev) < 0)
+ return NF_DROP;
+ }
+
+ __nf_flow_queue_xmit(net, skb, xmit);
+
+ return NF_STOLEN;
+}
+
unsigned int
nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -797,9 +880,6 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0)
return NF_DROP;
- if (nf_flow_encap_push(skb, other_tuple) < 0)
- return NF_DROP;
-
switch (tuplehash->tuple.xmit_type) {
case FLOW_OFFLOAD_XMIT_NEIGH:
rt = dst_rtable(tuplehash->tuple.dst_cache);
@@ -829,6 +909,8 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
WARN_ON_ONCE(1);
return NF_DROP;
}
+ xmit.tuple = other_tuple;
+ xmit.needs_gso_segment = tuplehash->tuple.needs_gso_segment;
return nf_flow_queue_xmit(state->net, skb, &xmit);
}
@@ -1037,7 +1119,7 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
return 0;
}
- if (skb_try_make_writable(skb, thoff + ctx->hdrsize))
+ if (skb_ensure_writable(skb, thoff + ctx->hdrsize))
return -1;
flow_offload_refresh(flow_table, flow, false);
@@ -1119,9 +1201,6 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
&ip6_daddr, encap_limit) < 0)
return NF_DROP;
- if (nf_flow_encap_push(skb, other_tuple) < 0)
- return NF_DROP;
-
switch (tuplehash->tuple.xmit_type) {
case FLOW_OFFLOAD_XMIT_NEIGH:
rt = dst_rt6_info(tuplehash->tuple.dst_cache);
@@ -1151,6 +1230,8 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
WARN_ON_ONCE(1);
return NF_DROP;
}
+ xmit.tuple = other_tuple;
+ xmit.needs_gso_segment = tuplehash->tuple.needs_gso_segment;
return nf_flow_queue_xmit(state->net, skb, &xmit);
}
diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
index 6bb9579dcc2a..9e88ea6a2eef 100644
--- a/net/netfilter/nf_flow_table_path.c
+++ b/net/netfilter/nf_flow_table_path.c
@@ -86,6 +86,7 @@ struct nft_forward_info {
u8 ingress_vlans;
u8 h_source[ETH_ALEN];
u8 h_dest[ETH_ALEN];
+ bool needs_gso_segment;
enum flow_offload_xmit_type xmit_type;
};
@@ -138,8 +139,11 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
path->encap.proto;
info->num_encaps++;
}
- if (path->type == DEV_PATH_PPPOE)
+ if (path->type == DEV_PATH_PPPOE) {
memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
+ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+ info->needs_gso_segment = 1;
+ }
break;
case DEV_PATH_BRIDGE:
if (is_zero_ether_addr(info->h_source))
@@ -279,6 +283,7 @@ static void nft_dev_forward_path(const struct nft_pktinfo *pkt,
memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
route->tuple[dir].xmit_type = info.xmit_type;
}
+ route->tuple[dir].out.needs_gso_segment = info.needs_gso_segment;
}
int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d20ce5c36d31..87387adbca65 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -407,6 +407,7 @@ static void nft_netdev_unregister_trans_hook(struct net *net,
}
static void nft_netdev_unregister_hooks(struct net *net,
+ const struct nft_table *table,
struct list_head *hook_list,
bool release_netdev)
{
@@ -414,8 +415,10 @@ static void nft_netdev_unregister_hooks(struct net *net,
struct nf_hook_ops *ops;
list_for_each_entry_safe(hook, next, hook_list, list) {
- list_for_each_entry(ops, &hook->ops_list, list)
- nf_unregister_net_hook(net, ops);
+ if (!(table->flags & NFT_TABLE_F_DORMANT)) {
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nf_unregister_net_hook(net, ops);
+ }
if (release_netdev)
nft_netdev_hook_unlink_free_rcu(hook);
}
@@ -452,20 +455,25 @@ static void __nf_tables_unregister_hook(struct net *net,
struct nft_base_chain *basechain;
const struct nf_hook_ops *ops;
- if (table->flags & NFT_TABLE_F_DORMANT ||
- !nft_is_base_chain(chain))
+ if (!nft_is_base_chain(chain))
return;
basechain = nft_base_chain(chain);
ops = &basechain->ops;
+ /* must also be called for dormant tables */
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
+ nft_netdev_unregister_hooks(net, table, &basechain->hook_list,
+ release_netdev);
+ return;
+ }
+
+ if (table->flags & NFT_TABLE_F_DORMANT)
+ return;
+
if (basechain->type->ops_unregister)
return basechain->type->ops_unregister(net, ops);
- if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
- nft_netdev_unregister_hooks(net, &basechain->hook_list,
- release_netdev);
- else
- nf_unregister_net_hook(net, &basechain->ops);
+ nf_unregister_net_hook(net, &basechain->ops);
}
static void nf_tables_unregister_hook(struct net *net,
@@ -4205,6 +4213,7 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
struct nft_chain *chain;
struct nft_ctx ctx = {
.net = net,
+ .table = (struct nft_table *)table,
.family = table->family,
};
int err = 0;
@@ -11281,11 +11290,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
break;
case NFT_MSG_NEWCHAIN:
if (nft_trans_chain_update(trans)) {
- if (!(table->flags & NFT_TABLE_F_DORMANT)) {
- nft_netdev_unregister_hooks(net,
- &nft_trans_chain_hooks(trans),
- true);
- }
+ nft_netdev_unregister_hooks(net, table,
+ &nft_trans_chain_hooks(trans),
+ true);
free_percpu(nft_trans_chain_stats(trans));
kfree(nft_trans_chain_name(trans));
nft_trans_destroy(trans);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 5ddd5b6e135f..8ab186f86dd4 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -153,7 +153,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
ptr = skb_network_header(skb) + pkt->nhoff;
else {
- if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
return false;
ptr = skb->data + nft_thoff(pkt);
}
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index decc725a33c2..0caa9304d2d0 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -261,10 +261,10 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return ret;
}
- nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
-
nft_compat_wait_for_destructors(ctx->net);
+ nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
+
ret = xt_check_target(&par, size, proto, inv);
if (ret < 0) {
if (ret == -ENOENT) {
@@ -353,8 +353,6 @@ nla_put_failure:
static int nft_target_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
- struct xt_target *target = expr->ops->data;
- unsigned int hook_mask = 0;
int ret;
if (ctx->family != NFPROTO_IPV4 &&
@@ -377,11 +375,21 @@ static int nft_target_validate(const struct nft_ctx *ctx,
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
const struct nf_hook_ops *ops = &basechain->ops;
+ unsigned int hook_mask = 1 << ops->hooknum;
+ struct xt_target *target = expr->ops->data;
+ void *info = nft_expr_priv(expr);
+ struct xt_tgchk_param par;
+ union nft_entry e = {};
- hook_mask = 1 << ops->hooknum;
if (target->hooks && !(hook_mask & target->hooks))
return -EINVAL;
+ nft_target_set_tgchk_param(&par, ctx, target, info, &e, 0, false);
+
+ ret = xt_check_hooks_target(&par);
+ if (ret < 0)
+ return ret;
+
ret = nft_compat_chain_validate_dependency(ctx, target->table);
if (ret < 0)
return ret;
@@ -515,10 +523,10 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return ret;
}
- nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
-
nft_compat_wait_for_destructors(ctx->net);
+ nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
+
return xt_check_match(&par, size, proto, inv);
}
@@ -614,8 +622,6 @@ static int nft_match_large_dump(struct sk_buff *skb,
static int nft_match_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
- struct xt_match *match = expr->ops->data;
- unsigned int hook_mask = 0;
int ret;
if (ctx->family != NFPROTO_IPV4 &&
@@ -638,11 +644,30 @@ static int nft_match_validate(const struct nft_ctx *ctx,
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
const struct nf_hook_ops *ops = &basechain->ops;
+ unsigned int hook_mask = 1 << ops->hooknum;
+ struct xt_match *match = expr->ops->data;
+ size_t size = XT_ALIGN(match->matchsize);
+ struct xt_mtchk_param par;
+ union nft_entry e = {};
+ void *info;
- hook_mask = 1 << ops->hooknum;
if (match->hooks && !(hook_mask & match->hooks))
return -EINVAL;
+ if (NFT_EXPR_SIZE(size) > NFT_MATCH_LARGE_THRESH) {
+ struct nft_xt_match_priv *priv = nft_expr_priv(expr);
+
+ info = priv->info;
+ } else {
+ info = nft_expr_priv(expr);
+ }
+
+ nft_match_set_mtchk_param(&par, ctx, match, info, &e, 0, false);
+
+ ret = xt_check_hooks_match(&par);
+ if (ret < 0)
+ return ret;
+
ret = nft_compat_chain_validate_dependency(ctx, match->table);
if (ret < 0)
return ret;
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 0407d6f708ae..e6a07c0df207 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -376,7 +376,7 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr,
const struct sctp_chunkhdr *sch;
struct sctp_chunkhdr _sch;
- if (pkt->tprot != IPPROTO_SCTP)
+ if (pkt->tprot != IPPROTO_SCTP || pkt->fragoff)
goto err;
do {
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 4bce36c3a6a0..b9e88d7cf308 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -95,12 +95,15 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
+ u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion();
struct nft_fwd_neigh *priv = nft_expr_priv(expr);
void *addr = &regs->data[priv->sreg_addr];
int oif = regs->data[priv->sreg_dev];
unsigned int verdict = NF_STOLEN;
struct sk_buff *skb = pkt->skb;
+ int nhoff = skb_network_offset(skb);
struct net_device *dev;
+ unsigned int hh_len;
int neigh_table;
switch (priv->nfproto) {
@@ -111,7 +114,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
verdict = NFT_BREAK;
goto out;
}
- if (skb_try_make_writable(skb, sizeof(*iph))) {
+ if (skb_ensure_writable(skb, nhoff + sizeof(*iph))) {
verdict = NF_DROP;
goto out;
}
@@ -132,7 +135,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
verdict = NFT_BREAK;
goto out;
}
- if (skb_try_make_writable(skb, sizeof(*ip6h))) {
+ if (skb_ensure_writable(skb, nhoff + sizeof(*ip6h))) {
verdict = NF_DROP;
goto out;
}
@@ -151,13 +154,31 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
goto out;
}
+ if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) {
+ verdict = NF_DROP;
+ goto out;
+ }
+
dev = dev_get_by_index_rcu(nft_net(pkt), oif);
- if (dev == NULL)
- return;
+ if (dev == NULL) {
+ verdict = NF_DROP;
+ goto out;
+ }
+
+ hh_len = LL_RESERVED_SPACE(dev);
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb) {
+ verdict = NF_STOLEN;
+ goto out;
+ }
+ }
skb->dev = dev;
skb_clear_tstamp(skb);
+ (*nf_dup_skb_recursion)++;
neigh_xmit(neigh_table, dev, addr, skb);
+ (*nf_dup_skb_recursion)--;
out:
regs->verdict.code = verdict;
}
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index c02d5cb52143..45fe56da5044 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -33,7 +33,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
return;
}
- if (pkt->tprot != IPPROTO_TCP) {
+ if (pkt->tprot != IPPROTO_TCP || pkt->fragoff) {
regs->verdict.code = NFT_BREAK;
return;
}
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index f2101af8c867..89be443734f6 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -30,8 +30,8 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr,
__be16 tport = 0;
struct sock *sk;
- if (pkt->tprot != IPPROTO_TCP &&
- pkt->tprot != IPPROTO_UDP) {
+ if ((pkt->tprot != IPPROTO_TCP &&
+ pkt->tprot != IPPROTO_UDP) || pkt->fragoff) {
regs->verdict.code = NFT_BREAK;
return;
}
@@ -97,8 +97,8 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
memset(&taddr, 0, sizeof(taddr));
- if (pkt->tprot != IPPROTO_TCP &&
- pkt->tprot != IPPROTO_UDP) {
+ if ((pkt->tprot != IPPROTO_TCP &&
+ pkt->tprot != IPPROTO_UDP) || pkt->fragoff) {
regs->verdict.code = NFT_BREAK;
return;
}
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 9f837fb5ceb4..2c67c2e6b132 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -477,11 +477,9 @@ int xt_check_proc_name(const char *name, unsigned int size)
}
EXPORT_SYMBOL(xt_check_proc_name);
-int xt_check_match(struct xt_mtchk_param *par,
- unsigned int size, u16 proto, bool inv_proto)
+static int xt_check_match_common(struct xt_mtchk_param *par,
+ unsigned int size, u16 proto, bool inv_proto)
{
- int ret;
-
if (XT_ALIGN(par->match->matchsize) != size &&
par->match->matchsize != -1) {
/*
@@ -530,6 +528,14 @@ int xt_check_match(struct xt_mtchk_param *par,
par->match->proto);
return -EINVAL;
}
+
+ return 0;
+}
+
+static int xt_checkentry_match(struct xt_mtchk_param *par)
+{
+ int ret;
+
if (par->match->checkentry != NULL) {
ret = par->match->checkentry(par);
if (ret < 0)
@@ -538,8 +544,34 @@ int xt_check_match(struct xt_mtchk_param *par,
/* Flag up potential errors. */
return -EIO;
}
+
+ return 0;
+}
+
+int xt_check_hooks_match(struct xt_mtchk_param *par)
+{
+ if (par->match->check_hooks != NULL)
+ return par->match->check_hooks(par);
+
return 0;
}
+EXPORT_SYMBOL_GPL(xt_check_hooks_match);
+
+int xt_check_match(struct xt_mtchk_param *par,
+ unsigned int size, u16 proto, bool inv_proto)
+{
+ int ret;
+
+ ret = xt_check_match_common(par, size, proto, inv_proto);
+ if (ret < 0)
+ return ret;
+
+ ret = xt_check_hooks_match(par);
+ if (ret < 0)
+ return ret;
+
+ return xt_checkentry_match(par);
+}
EXPORT_SYMBOL_GPL(xt_check_match);
/** xt_check_entry_match - check that matches end before start of target
@@ -1012,11 +1044,9 @@ bool xt_find_jump_offset(const unsigned int *offsets,
}
EXPORT_SYMBOL(xt_find_jump_offset);
-int xt_check_target(struct xt_tgchk_param *par,
- unsigned int size, u16 proto, bool inv_proto)
+static int xt_check_target_common(struct xt_tgchk_param *par,
+ unsigned int size, u16 proto, bool inv_proto)
{
- int ret;
-
if (XT_ALIGN(par->target->targetsize) != size) {
pr_err_ratelimited("%s_tables: %s.%u target: invalid size %u (kernel) != (user) %u\n",
xt_prefix[par->family], par->target->name,
@@ -1061,6 +1091,23 @@ int xt_check_target(struct xt_tgchk_param *par,
par->target->proto);
return -EINVAL;
}
+
+ return 0;
+}
+
+int xt_check_hooks_target(struct xt_tgchk_param *par)
+{
+ if (par->target->check_hooks != NULL)
+ return par->target->check_hooks(par);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xt_check_hooks_target);
+
+static int xt_checkentry_target(struct xt_tgchk_param *par)
+{
+ int ret;
+
if (par->target->checkentry != NULL) {
ret = par->target->checkentry(par);
if (ret < 0)
@@ -1071,6 +1118,22 @@ int xt_check_target(struct xt_tgchk_param *par,
}
return 0;
}
+
+int xt_check_target(struct xt_tgchk_param *par,
+ unsigned int size, u16 proto, bool inv_proto)
+{
+ int ret;
+
+ ret = xt_check_target_common(par, size, proto, inv_proto);
+ if (ret < 0)
+ return ret;
+
+ ret = xt_check_hooks_target(par);
+ if (ret < 0)
+ return ret;
+
+ return xt_checkentry_target(par);
+}
EXPORT_SYMBOL_GPL(xt_check_target);
/**
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 498f5871c84a..d2aeacf94230 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -354,7 +354,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.revision = 1,
.targetsize = sizeof(struct xt_ct_target_info_v1),
- .usersize = offsetof(struct xt_ct_target_info, ct),
+ .usersize = offsetof(struct xt_ct_target_info_v1, ct),
.checkentry = xt_ct_tg_check_v1,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
@@ -366,7 +366,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.revision = 2,
.targetsize = sizeof(struct xt_ct_target_info_v1),
- .usersize = offsetof(struct xt_ct_target_info, ct),
+ .usersize = offsetof(struct xt_ct_target_info_v1, ct),
.checkentry = xt_ct_tg_check_v2,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
@@ -398,7 +398,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.revision = 1,
.targetsize = sizeof(struct xt_ct_target_info_v1),
- .usersize = offsetof(struct xt_ct_target_info, ct),
+ .usersize = offsetof(struct xt_ct_target_info_v1, ct),
.checkentry = xt_ct_tg_check_v1,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
@@ -410,7 +410,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.revision = 2,
.targetsize = sizeof(struct xt_ct_target_info_v1),
- .usersize = offsetof(struct xt_ct_target_info, ct),
+ .usersize = offsetof(struct xt_ct_target_info_v1, ct),
.checkentry = xt_ct_tg_check_v2,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 116a885adb3c..80e1634bc51f 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -247,6 +247,21 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
}
#endif
+static int tcpmss_tg4_check_hooks(const struct xt_tgchk_param *par)
+{
+ const struct xt_tcpmss_info *info = par->targinfo;
+
+ if (info->mss == XT_TCPMSS_CLAMP_PMTU &&
+ (par->hook_mask & ~((1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING))) != 0) {
+ pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/* Must specify -p tcp --syn */
static inline bool find_syn_match(const struct xt_entry_match *m)
{
@@ -262,17 +277,9 @@ static inline bool find_syn_match(const struct xt_entry_match *m)
static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
{
- const struct xt_tcpmss_info *info = par->targinfo;
const struct ipt_entry *e = par->entryinfo;
const struct xt_entry_match *ematch;
- if (info->mss == XT_TCPMSS_CLAMP_PMTU &&
- (par->hook_mask & ~((1 << NF_INET_FORWARD) |
- (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_POST_ROUTING))) != 0) {
- pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
- return -EINVAL;
- }
if (par->nft_compat)
return 0;
@@ -286,17 +293,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static int tcpmss_tg6_check(const struct xt_tgchk_param *par)
{
- const struct xt_tcpmss_info *info = par->targinfo;
const struct ip6t_entry *e = par->entryinfo;
const struct xt_entry_match *ematch;
- if (info->mss == XT_TCPMSS_CLAMP_PMTU &&
- (par->hook_mask & ~((1 << NF_INET_FORWARD) |
- (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_POST_ROUTING))) != 0) {
- pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
- return -EINVAL;
- }
if (par->nft_compat)
return 0;
@@ -312,6 +311,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = {
{
.family = NFPROTO_IPV4,
.name = "TCPMSS",
+ .check_hooks = tcpmss_tg4_check_hooks,
.checkentry = tcpmss_tg4_check,
.target = tcpmss_tg4,
.targetsize = sizeof(struct xt_tcpmss_info),
@@ -322,6 +322,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = {
{
.family = NFPROTO_IPV6,
.name = "TCPMSS",
+ .check_hooks = tcpmss_tg4_check_hooks,
.checkentry = tcpmss_tg6_check,
.target = tcpmss_tg6,
.targetsize = sizeof(struct xt_tcpmss_info),
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index e4bea1d346cf..5f60e7298a1e 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -86,6 +86,9 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info *tgi = par->targinfo;
+ if (par->fragoff)
+ return NF_DROP;
+
return tproxy_tg4(xt_net(par), skb, tgi->laddr, tgi->lport,
tgi->mark_mask, tgi->mark_value);
}
@@ -95,6 +98,9 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+ if (par->fragoff)
+ return NF_DROP;
+
return tproxy_tg4(xt_net(par), skb, tgi->laddr.ip, tgi->lport,
tgi->mark_mask, tgi->mark_value);
}
@@ -106,6 +112,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+ unsigned short fragoff = 0;
struct udphdr _hdr, *hp;
struct sock *sk;
const struct in6_addr *laddr;
@@ -113,8 +120,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
int thoff = 0;
int tproto;
- tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
- if (tproto < 0)
+ tproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL);
+ if (tproto < 0 || fragoff)
return NF_DROP;
hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index a77088943107..913dbe3aa5e2 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -153,14 +153,10 @@ addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
return ret;
}
-static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
+static int addrtype_mt_check_hooks(const struct xt_mtchk_param *par)
{
- const char *errmsg = "both incoming and outgoing interface limitation cannot be selected";
struct xt_addrtype_info_v1 *info = par->matchinfo;
-
- if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN &&
- info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT)
- goto err;
+ const char *errmsg;
if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN)) &&
@@ -176,6 +172,21 @@ static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
goto err;
}
+ return 0;
+err:
+ pr_info_ratelimited("%s\n", errmsg);
+ return -EINVAL;
+}
+
+static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
+{
+ const char *errmsg = "both incoming and outgoing interface limitation cannot be selected";
+ struct xt_addrtype_info_v1 *info = par->matchinfo;
+
+ if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN &&
+ info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT)
+ goto err;
+
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
if (par->family == NFPROTO_IPV6) {
if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) {
@@ -211,6 +222,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.revision = 1,
.match = addrtype_mt_v1,
+ .check_hooks = addrtype_mt_check_hooks,
.checkentry = addrtype_mt_checkentry_v1,
.matchsize = sizeof(struct xt_addrtype_info_v1),
.me = THIS_MODULE
@@ -221,6 +233,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.revision = 1,
.match = addrtype_mt_v1,
+ .check_hooks = addrtype_mt_check_hooks,
.checkentry = addrtype_mt_checkentry_v1,
.matchsize = sizeof(struct xt_addrtype_info_v1),
.me = THIS_MODULE
diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c
index 9520dd00070b..6d1a44ab5eee 100644
--- a/net/netfilter/xt_devgroup.c
+++ b/net/netfilter/xt_devgroup.c
@@ -33,14 +33,10 @@ static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
return true;
}
-static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
+static int devgroup_mt_check_hooks(const struct xt_mtchk_param *par)
{
const struct xt_devgroup_info *info = par->matchinfo;
- if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC |
- XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST))
- return -EINVAL;
-
if (info->flags & XT_DEVGROUP_MATCH_SRC &&
par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN) |
@@ -56,9 +52,21 @@ static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
return 0;
}
+static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct xt_devgroup_info *info = par->matchinfo;
+
+ if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC |
+ XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST))
+ return -EINVAL;
+
+ return 0;
+}
+
static struct xt_match devgroup_mt_reg __read_mostly = {
.name = "devgroup",
.match = devgroup_mt,
+ .check_hooks = devgroup_mt_check_hooks,
.checkentry = devgroup_mt_checkentry,
.matchsize = sizeof(struct xt_devgroup_info),
.family = NFPROTO_UNSPEC,
diff --git a/net/netfilter/xt_ecn.c b/net/netfilter/xt_ecn.c
index b96e8203ac54..a8503f5d26bf 100644
--- a/net/netfilter/xt_ecn.c
+++ b/net/netfilter/xt_ecn.c
@@ -30,6 +30,10 @@ static bool match_tcp(const struct sk_buff *skb, struct xt_action_param *par)
struct tcphdr _tcph;
const struct tcphdr *th;
+ /* this is fine for IPv6 as ecn_mt_check6() enforces -p tcp */
+ if (par->fragoff)
+ return false;
+
/* In practice, TCP match does this, so can't fail. But let's
* be good citizens.
*/
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 3bd127bfc114..2704b4b60d1e 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -658,6 +658,8 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
if (!(hinfo->cfg.mode &
(XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)))
return 0;
+ if (ntohs(ip_hdr(skb)->frag_off) & IP_OFFSET)
+ return -1;
nexthdr = ip_hdr(skb)->protocol;
break;
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
@@ -681,7 +683,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
return 0;
nexthdr = ipv6_hdr(skb)->nexthdr;
protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off);
- if ((int)protoff < 0)
+ if ((int)protoff < 0 || ntohs(frag_off) & IP6_OFFSET)
return -1;
break;
}
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index dc9485854002..e8807caede68 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -27,6 +27,9 @@
static bool
xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
{
+ if (p->fragoff)
+ return false;
+
return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p),
xt_out(p), p->matchinfo, xt_net(p), nf_osf_fingers);
}
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index d2b0b52434fa..dd98f758176c 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -91,14 +91,10 @@ match_outdev:
return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT));
}
-static int physdev_mt_check(const struct xt_mtchk_param *par)
+static int physdev_mt_check_hooks(const struct xt_mtchk_param *par)
{
const struct xt_physdev_info *info = par->matchinfo;
- static bool brnf_probed __read_mostly;
- if (!(info->bitmask & XT_PHYSDEV_OP_MASK) ||
- info->bitmask & ~XT_PHYSDEV_OP_MASK)
- return -EINVAL;
if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) &&
(!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) ||
info->invert & XT_PHYSDEV_OP_BRIDGED) &&
@@ -107,6 +103,18 @@ static int physdev_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
}
+ return 0;
+}
+
+static int physdev_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_physdev_info *info = par->matchinfo;
+ static bool brnf_probed __read_mostly;
+
+ if (!(info->bitmask & XT_PHYSDEV_OP_MASK) ||
+ info->bitmask & ~XT_PHYSDEV_OP_MASK)
+ return -EINVAL;
+
#define X(memb) strnlen(info->memb, sizeof(info->memb)) >= sizeof(info->memb)
if (info->bitmask & XT_PHYSDEV_OP_IN) {
if (info->physindev[0] == '\0')
@@ -141,6 +149,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = {
{
.name = "physdev",
.family = NFPROTO_IPV4,
+ .check_hooks = physdev_mt_check_hooks,
.checkentry = physdev_mt_check,
.match = physdev_mt,
.matchsize = sizeof(struct xt_physdev_info),
@@ -149,6 +158,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = {
{
.name = "physdev",
.family = NFPROTO_IPV6,
+ .check_hooks = physdev_mt_check_hooks,
.checkentry = physdev_mt_check,
.match = physdev_mt,
.matchsize = sizeof(struct xt_physdev_info),
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index b5fa65558318..ff54e3a8581e 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -126,13 +126,10 @@ policy_mt(const struct sk_buff *skb, struct xt_action_param *par)
return ret;
}
-static int policy_mt_check(const struct xt_mtchk_param *par)
+static int policy_mt_check_hooks(const struct xt_mtchk_param *par)
{
const struct xt_policy_info *info = par->matchinfo;
- const char *errmsg = "neither incoming nor outgoing policy selected";
-
- if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT)))
- goto err;
+ const char *errmsg;
if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) {
@@ -144,6 +141,21 @@ static int policy_mt_check(const struct xt_mtchk_param *par)
errmsg = "input policy not valid in POSTROUTING and OUTPUT";
goto err;
}
+
+ return 0;
+err:
+ pr_info_ratelimited("%s\n", errmsg);
+ return -EINVAL;
+}
+
+static int policy_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_policy_info *info = par->matchinfo;
+ const char *errmsg = "neither incoming nor outgoing policy selected";
+
+ if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT)))
+ goto err;
+
if (info->len > XT_POLICY_MAX_ELEM) {
errmsg = "too many policy elements";
goto err;
@@ -158,6 +170,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = {
{
.name = "policy",
.family = NFPROTO_IPV4,
+ .check_hooks = policy_mt_check_hooks,
.checkentry = policy_mt_check,
.match = policy_mt,
.matchsize = sizeof(struct xt_policy_info),
@@ -166,6 +179,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = {
{
.name = "policy",
.family = NFPROTO_IPV6,
+ .check_hooks = policy_mt_check_hooks,
.checkentry = policy_mt_check,
.match = policy_mt,
.matchsize = sizeof(struct xt_policy_info),
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 731bc2cafae4..4ae04bba9358 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -431,6 +431,29 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
}
static int
+set_target_v3_check_hooks(const struct xt_tgchk_param *par)
+{
+ const struct xt_set_info_target_v3 *info = par->targinfo;
+
+ if (info->map_set.index != IPSET_INVALID_ID) {
+ if (strncmp(par->table, "mangle", 7)) {
+ pr_info_ratelimited("--map-set only usable from mangle table\n");
+ return -EINVAL;
+ }
+ if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) |
+ (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) &&
+ (par->hook_mask & ~(1 << NF_INET_FORWARD |
+ 1 << NF_INET_LOCAL_OUT |
+ 1 << NF_INET_POST_ROUTING))) {
+ pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int
set_target_v3_checkentry(const struct xt_tgchk_param *par)
{
const struct xt_set_info_target_v3 *info = par->targinfo;
@@ -459,20 +482,6 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
}
if (info->map_set.index != IPSET_INVALID_ID) {
- if (strncmp(par->table, "mangle", 7)) {
- pr_info_ratelimited("--map-set only usable from mangle table\n");
- ret = -EINVAL;
- goto cleanup_del;
- }
- if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) |
- (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) &&
- (par->hook_mask & ~(1 << NF_INET_FORWARD |
- 1 << NF_INET_LOCAL_OUT |
- 1 << NF_INET_POST_ROUTING))) {
- pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n");
- ret = -EINVAL;
- goto cleanup_del;
- }
index = ip_set_nfnl_get_byindex(par->net,
info->map_set.index);
if (index == IPSET_INVALID_ID) {
@@ -672,6 +681,7 @@ static struct xt_target set_targets[] __read_mostly = {
.family = NFPROTO_IPV4,
.target = set_target_v3,
.targetsize = sizeof(struct xt_set_info_target_v3),
+ .check_hooks = set_target_v3_check_hooks,
.checkentry = set_target_v3_checkentry,
.destroy = set_target_v3_destroy,
.me = THIS_MODULE
@@ -682,6 +692,7 @@ static struct xt_target set_targets[] __read_mostly = {
.family = NFPROTO_IPV6,
.target = set_target_v3,
.targetsize = sizeof(struct xt_set_info_target_v3),
+ .check_hooks = set_target_v3_check_hooks,
.checkentry = set_target_v3_checkentry,
.destroy = set_target_v3_destroy,
.me = THIS_MODULE
diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
index 0d32d4841cb3..b9da8269161d 100644
--- a/net/netfilter/xt_tcpmss.c
+++ b/net/netfilter/xt_tcpmss.c
@@ -32,6 +32,10 @@ tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par)
u8 _opt[15 * 4 - sizeof(_tcph)];
unsigned int i, optlen;
+ /* this is fine for IPv6 as xt_tcpmss enforces -p tcp */
+ if (par->fragoff)
+ return false;
+
/* If we don't have the whole header, drop packet. */
th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph);
if (th == NULL)