aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorChia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>2025-09-16 10:24:31 +0200
committerPaolo Abeni <pabeni@redhat.com>2025-09-18 08:47:52 +0200
commitaa55a7dde7ec506bb23448a5005ae3f4f809d022 (patch)
tree41675fff92b54c414bbc37a465fc7dd8d4d46e0c /net
parenttcp: accecn: AccECN option (diff)
downloadlinux-aa55a7dde7ec506bb23448a5005ae3f4f809d022.tar.gz
linux-aa55a7dde7ec506bb23448a5005ae3f4f809d022.zip
tcp: accecn: AccECN option send control
Instead of sending the option in every ACK, limit sending to those ACKs where the option is necessary: - Handshake - "Change-triggered ACK" + the ACK following it. The 2nd ACK is necessary to unambiguously indicate which of the ECN byte counters in increasing. The first ACK has two counters increasing due to the ecnfield edge. - ACKs with CE to allow CEP delta validations to take advantage of the option. - Force option to be sent every at least once per 2^22 bytes. The check is done using the bit edges of the byte counters (avoids need for extra variables). - AccECN option beacon to send a few times per RTT even if nothing in the ECN state requires that. The default is 3 times per RTT, and its period can be set via sysctl_tcp_ecn_option_beacon. Below are the pahole outcomes before and after this patch, in which the group size of tcp_sock_write_tx is increased from 89 to 97 due to the new u64 accecn_opt_tstamp member: [BEFORE THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2488 8 */ struct list_head tsorted_sent_queue; /* 2496 16 */ [...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2521 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ u8 accecn_minlen:2; /* 2523: 0 1 */ u8 est_ecnfield:2; /* 2523: 2 1 */ u8 unused3:4; /* 2523: 4 1 */ [...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */ [...] /* size: 3200, cachelines: 50, members: 171 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2488 8 */ u64 accecn_opt_tstamp; /* 2596 8 */ struct list_head tsorted_sent_queue; /* 2504 16 */ [...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2529 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2529 0 */ u8 nonagle:4; /* 2529: 0 1 */ u8 rate_app_limited:1; /* 2529: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2530: 0 1 */ u8 unused2:4; /* 2530: 4 1 */ u8 accecn_minlen:2; /* 2531: 0 1 */ u8 est_ecnfield:2; /* 2531: 2 1 */ u8 accecn_opt_demand:2; /* 2531: 4 1 */ u8 prev_ecnfield:2; /* 2531: 6 1 */ [...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2636 0 */ [...] /* size: 3200, cachelines: 50, members: 173 */ } Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com> Co-developed-by: Ilpo Järvinen <ij@kernel.org> Signed-off-by: Ilpo Järvinen <ij@kernel.org> Reviewed-by: Eric Dumazet <edumazet@google.com> Link: https://patch.msgid.link/20250916082434.100722-8-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Diffstat (limited to 'net')
-rw-r--r--net/ipv4/sysctl_net_ipv4.c9
-rw-r--r--net/ipv4/tcp.c5
-rw-r--r--net/ipv4/tcp_input.c4
-rw-r--r--net/ipv4/tcp_ipv4.c1
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv4/tcp_output.c26
6 files changed, 39 insertions, 8 deletions
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4a697acb4e85..24dbc603cc44 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -741,6 +741,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = SYSCTL_TWO,
},
{
+ .procname = "tcp_ecn_option_beacon",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
+ },
+ {
.procname = "tcp_ecn_fallback",
.data = &init_net.ipv4.sysctl_tcp_ecn_fallback,
.maxlen = sizeof(u8),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8c4a4b8666fc..090f9ac43d4c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3410,6 +3410,8 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->delivered_ce = 0;
tp->accecn_fail_mode = 0;
tcp_accecn_init_counters(tp);
+ tp->prev_ecnfield = 0;
+ tp->accecn_opt_tstamp = 0;
if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
@@ -5134,11 +5136,12 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 97);
/* TXRX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e898a76c485e..87154fd86167 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6121,8 +6121,10 @@ step1:
* RFC 5961 4.2 : Send a challenge ack
*/
if (th->syn) {
- if (tcp_ecn_mode_accecn(tp))
+ if (tcp_ecn_mode_accecn(tp)) {
accecn_reflector = true;
+ tcp_accecn_opt_demand_min(sk, 1);
+ }
if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index aa8dbfe20924..6a63be1f6461 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3562,6 +3562,7 @@ static int __net_init tcp_sk_init(struct net *net)
{
net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
+ net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1dbcc09ff7a9..193343494558 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -463,6 +463,8 @@ static void tcp_ecn_openreq_child(struct sock *sk,
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
tp->syn_ect_snt = treq->syn_ect_snt;
tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
+ tp->prev_ecnfield = treq->syn_ect_rcv;
+ tp->accecn_opt_demand = 1;
tcp_ecn_received_counters_payload(sk, skb);
} else {
tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 34e5c83bbace..f897c2594954 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -705,8 +705,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
*ptr++ = htonl(((e0b & 0xffffff) << 8) |
TCPOPT_NOP);
}
- if (tp)
+ if (tp) {
tp->accecn_minlen = 0;
+ tp->accecn_opt_tstamp = tp->tcp_mstamp;
+ if (tp->accecn_opt_demand)
+ tp->accecn_opt_demand--;
+ }
}
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
@@ -1149,11 +1153,16 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->num_sack_blocks = 0;
}
- if (tcp_ecn_mode_accecn(tp) &&
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) {
- opts->use_synack_ecn_bytes = 0;
- size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
- MAX_TCP_OPTION_SPACE - size);
+ if (tcp_ecn_mode_accecn(tp)) {
+ int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option);
+
+ if (ecn_opt &&
+ (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand ||
+ tcp_accecn_option_beacon_check(sk))) {
+ opts->use_synack_ecn_bytes = 0;
+ size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
+ MAX_TCP_OPTION_SPACE - size);
+ }
}
if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
@@ -2863,6 +2872,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
sent_pkts = 0;
tcp_mstamp_refresh(tp);
+
+ /* AccECN option beacon depends on mstamp, it may change mss */
+ if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk))
+ mss_now = tcp_current_mss(sk);
+
if (!push_one) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);