diff options
| author | Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com> | 2025-09-16 10:24:31 +0200 |
|---|---|---|
| committer | Paolo Abeni <pabeni@redhat.com> | 2025-09-18 08:47:52 +0200 |
| commit | aa55a7dde7ec506bb23448a5005ae3f4f809d022 (patch) | |
| tree | 41675fff92b54c414bbc37a465fc7dd8d4d46e0c /net | |
| parent | tcp: accecn: AccECN option (diff) | |
| download | linux-aa55a7dde7ec506bb23448a5005ae3f4f809d022.tar.gz linux-aa55a7dde7ec506bb23448a5005ae3f4f809d022.zip | |
tcp: accecn: AccECN option send control
Instead of sending the option in every ACK, limit sending to
those ACKs where the option is necessary:
- Handshake
- "Change-triggered ACK" + the ACK following it. The
2nd ACK is necessary to unambiguously indicate which
of the ECN byte counters in increasing. The first
ACK has two counters increasing due to the ecnfield
edge.
- ACKs with CE to allow CEP delta validations to take
advantage of the option.
- Force option to be sent every at least once per 2^22
bytes. The check is done using the bit edges of the
byte counters (avoids need for extra variables).
- AccECN option beacon to send a few times per RTT even if
nothing in the ECN state requires that. The default is 3
times per RTT, and its period can be set via
sysctl_tcp_ecn_option_beacon.
Below are the pahole outcomes before and after this patch,
in which the group size of tcp_sock_write_tx is increased
from 89 to 97 due to the new u64 accecn_opt_tstamp member:
[BEFORE THIS PATCH]
struct tcp_sock {
[...]
u64 tcp_wstamp_ns; /* 2488 8 */
struct list_head tsorted_sent_queue; /* 2496 16 */
[...]
__cacheline_group_end__tcp_sock_write_tx[0]; /* 2521 0 */
__cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */
u8 nonagle:4; /* 2521: 0 1 */
u8 rate_app_limited:1; /* 2521: 4 1 */
/* XXX 3 bits hole, try to pack */
/* Force alignment to the next boundary: */
u8 :0;
u8 received_ce_pending:4;/* 2522: 0 1 */
u8 unused2:4; /* 2522: 4 1 */
u8 accecn_minlen:2; /* 2523: 0 1 */
u8 est_ecnfield:2; /* 2523: 2 1 */
u8 unused3:4; /* 2523: 4 1 */
[...]
__cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */
[...]
/* size: 3200, cachelines: 50, members: 171 */
}
[AFTER THIS PATCH]
struct tcp_sock {
[...]
u64 tcp_wstamp_ns; /* 2488 8 */
u64 accecn_opt_tstamp; /* 2596 8 */
struct list_head tsorted_sent_queue; /* 2504 16 */
[...]
__cacheline_group_end__tcp_sock_write_tx[0]; /* 2529 0 */
__cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2529 0 */
u8 nonagle:4; /* 2529: 0 1 */
u8 rate_app_limited:1; /* 2529: 4 1 */
/* XXX 3 bits hole, try to pack */
/* Force alignment to the next boundary: */
u8 :0;
u8 received_ce_pending:4;/* 2530: 0 1 */
u8 unused2:4; /* 2530: 4 1 */
u8 accecn_minlen:2; /* 2531: 0 1 */
u8 est_ecnfield:2; /* 2531: 2 1 */
u8 accecn_opt_demand:2; /* 2531: 4 1 */
u8 prev_ecnfield:2; /* 2531: 6 1 */
[...]
__cacheline_group_end__tcp_sock_write_txrx[0]; /* 2636 0 */
[...]
/* size: 3200, cachelines: 50, members: 173 */
}
Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Co-developed-by: Ilpo Järvinen <ij@kernel.org>
Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250916082434.100722-8-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Diffstat (limited to 'net')
| -rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 9 | ||||
| -rw-r--r-- | net/ipv4/tcp.c | 5 | ||||
| -rw-r--r-- | net/ipv4/tcp_input.c | 4 | ||||
| -rw-r--r-- | net/ipv4/tcp_ipv4.c | 1 | ||||
| -rw-r--r-- | net/ipv4/tcp_minisocks.c | 2 | ||||
| -rw-r--r-- | net/ipv4/tcp_output.c | 26 |
6 files changed, 39 insertions, 8 deletions
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4a697acb4e85..24dbc603cc44 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -741,6 +741,15 @@ static struct ctl_table ipv4_net_table[] = { .extra2 = SYSCTL_TWO, }, { + .procname = "tcp_ecn_option_beacon", + .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, .maxlen = sizeof(u8), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8c4a4b8666fc..090f9ac43d4c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3410,6 +3410,8 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered_ce = 0; tp->accecn_fail_mode = 0; tcp_accecn_init_counters(tp); + tp->prev_ecnfield = 0; + tp->accecn_opt_tstamp = 0; if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -5134,11 +5136,12 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 97); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e898a76c485e..87154fd86167 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6121,8 +6121,10 @@ step1: * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { - if (tcp_ecn_mode_accecn(tp)) + if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; + tcp_accecn_opt_demand_min(sk, 1); + } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index aa8dbfe20924..6a63be1f6461 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3562,6 +3562,7 @@ static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; + net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1dbcc09ff7a9..193343494558 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,6 +463,8 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 34e5c83bbace..f897c2594954 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -705,8 +705,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(((e0b & 0xffffff) << 8) | TCPOPT_NOP); } - if (tp) + if (tp) { tp->accecn_minlen = 0; + tp->accecn_opt_tstamp = tp->tcp_mstamp; + if (tp->accecn_opt_demand) + tp->accecn_opt_demand--; + } } if (unlikely(OPTION_SACK_ADVERTISE & options)) { @@ -1149,11 +1153,16 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks = 0; } - if (tcp_ecn_mode_accecn(tp) && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) { - opts->use_synack_ecn_bytes = 0; - size += tcp_options_fit_accecn(opts, tp->accecn_minlen, - MAX_TCP_OPTION_SPACE - size); + if (tcp_ecn_mode_accecn(tp)) { + int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); + + if (ecn_opt && + (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || + tcp_accecn_option_beacon_check(sk))) { + opts->use_synack_ecn_bytes = 0; + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, + MAX_TCP_OPTION_SPACE - size); + } } if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, @@ -2863,6 +2872,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, sent_pkts = 0; tcp_mstamp_refresh(tp); + + /* AccECN option beacon depends on mstamp, it may change mss */ + if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk)) + mss_now = tcp_current_mss(sk); + if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk); |
