From 41a6e8ab18642741437da932c2f5762b185e928c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 13 Aug 2025 12:44:17 +0300 Subject: devlink/port: Check attributes early and constify Constify the devlink port attributes to indicate they are read only and does not depend on anything else. Therefore, validate it early before setting in the devlink port. Reviewed-by: Jiri Pirko Signed-off-by: Parav Pandit Link: https://patch.msgid.link/20250813094417.7269-3-parav@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index b32c9ceeb81d..3119d053bc4d 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1743,7 +1743,7 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev); void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, - struct devlink_port_attrs *devlink_port_attrs); + const struct devlink_port_attrs *attrs); void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, bool external); void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, -- cgit v1.2.3 From 52bf272636bda69587952b35ae97690b8dc89941 Mon Sep 17 00:00:00 2001 From: William Liu Date: Tue, 12 Aug 2025 23:57:57 +0000 Subject: net/sched: Fix backlog accounting in qdisc_dequeue_internal This issue applies for the following qdiscs: hhf, fq, fq_codel, and fq_pie, and occurs in their change handlers when adjusting to the new limit. The problem is the following in the values passed to the subsequent qdisc_tree_reduce_backlog call given a tbf parent: When the tbf parent runs out of tokens, skbs of these qdiscs will be placed in gso_skb. Their peek handlers are qdisc_peek_dequeued, which accounts for both qlen and backlog. However, in the case of qdisc_dequeue_internal, ONLY qlen is accounted for when pulling from gso_skb. This means that these qdiscs are missing a qdisc_qstats_backlog_dec when dropping packets to satisfy the new limit in their change handlers. One can observe this issue with the following (with tc patched to support a limit of 0): export TARGET=fq tc qdisc del dev lo root tc qdisc add dev lo root handle 1: tbf rate 8bit burst 100b latency 1ms tc qdisc replace dev lo handle 3: parent 1:1 $TARGET limit 1000 echo ''; echo 'add child'; tc -s -d qdisc show dev lo ping -I lo -f -c2 -s32 -W0.001 127.0.0.1 2>&1 >/dev/null echo ''; echo 'after ping'; tc -s -d qdisc show dev lo tc qdisc change dev lo handle 3: parent 1:1 $TARGET limit 0 echo ''; echo 'after limit drop'; tc -s -d qdisc show dev lo tc qdisc replace dev lo handle 2: parent 1:1 sfq echo ''; echo 'post graft'; tc -s -d qdisc show dev lo The second to last show command shows 0 packets but a positive number (74) of backlog bytes. The problem becomes clearer in the last show command, where qdisc_purge_queue triggers qdisc_tree_reduce_backlog with the positive backlog and causes an underflow in the tbf parent's backlog (4096 Mb instead of 0). To fix this issue, the codepath for all clients of qdisc_dequeue_internal has been simplified: codel, pie, hhf, fq, fq_pie, and fq_codel. qdisc_dequeue_internal handles the backlog adjustments for all cases that do not directly use the dequeue handler. The old fq_codel_change limit adjustment loop accumulated the arguments to the subsequent qdisc_tree_reduce_backlog call through the cstats field. However, this is confusing and error prone as fq_codel_dequeue could also potentially mutate this field (which qdisc_dequeue_internal calls in the non gso_skb case), so we have unified the code here with other qdiscs. Fixes: 2d3cbfd6d54a ("net_sched: Flush gso_skb list too during ->change()") Fixes: 4b549a2ef4be ("fq_codel: Fair Queue Codel AQM") Fixes: 10239edf86f1 ("net-qdisc-hhf: Heavy-Hitter Filter (HHF) qdisc") Signed-off-by: William Liu Reviewed-by: Savino Dicanosa Link: https://patch.msgid.link/20250812235725.45243-1-will@willsroot.io Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 11 ++++++++--- net/sched/sch_codel.c | 12 +++++++----- net/sched/sch_fq.c | 12 +++++++----- net/sched/sch_fq_codel.c | 12 +++++++----- net/sched/sch_fq_pie.c | 12 +++++++----- net/sched/sch_hhf.c | 12 +++++++----- net/sched/sch_pie.c | 12 +++++++----- 7 files changed, 50 insertions(+), 33 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 638948be4c50..738cd5b13c62 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1038,12 +1038,17 @@ static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool dir skb = __skb_dequeue(&sch->gso_skb); if (skb) { sch->q.qlen--; + qdisc_qstats_backlog_dec(sch, skb); return skb; } - if (direct) - return __qdisc_dequeue_head(&sch->q); - else + if (direct) { + skb = __qdisc_dequeue_head(&sch->q); + if (skb) + qdisc_qstats_backlog_dec(sch, skb); + return skb; + } else { return sch->dequeue(sch); + } } static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index c93761040c6e..fa0314679e43 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -101,9 +101,9 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { static int codel_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CODEL_MAX + 1]; - unsigned int qlen, dropped = 0; int err; err = nla_parse_nested_deprecated(tb, TCA_CODEL_MAX, opt, @@ -142,15 +142,17 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->params.ecn, !!nla_get_u32(tb[TCA_CODEL_ECN])); - qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, true); - dropped += qdisc_pkt_len(skb); - qdisc_qstats_backlog_dec(sch, skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_qdisc_drop(skb, sch); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 902ff5470607..fee922da2f99 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -1013,11 +1013,11 @@ static int fq_load_priomap(struct fq_sched_data *q, static int fq_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_MAX + 1]; - int err, drop_count = 0; - unsigned drop_len = 0; u32 fq_log; + int err; err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy, NULL); @@ -1135,16 +1135,18 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, err = fq_resize(sch, fq_log); sch_tree_lock(sch); } + while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; - drop_len += qdisc_pkt_len(skb); + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); - drop_count++; } - qdisc_tree_reduce_backlog(sch, drop_count, drop_len); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return err; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 2a0f3a513bfa..a14142392939 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -366,6 +366,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; u32 quantum = 0; @@ -443,13 +444,14 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, q->memory_usage > q->memory_limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); - q->cstats.drop_len += qdisc_pkt_len(skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); - q->cstats.drop_count++; } - qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); - q->cstats.drop_count = 0; - q->cstats.drop_len = 0; + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index b0e34daf1f75..7b96bc3ff891 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -287,10 +287,9 @@ begin: static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_PIE_MAX + 1]; - unsigned int len_dropped = 0; - unsigned int num_dropped = 0; int err; err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack); @@ -368,11 +367,14 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); - len_dropped += qdisc_pkt_len(skb); - num_dropped += 1; + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } - qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 5aa434b46707..2d4855e28a28 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -508,9 +508,9 @@ static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { static int hhf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; - unsigned int qlen, prev_backlog; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; @@ -561,15 +561,17 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, usecs_to_jiffies(us)); } - qlen = sch->q.qlen; - prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, - prev_backlog - sch->qstats.backlog); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index ad46ee3ed5a9..0a377313b6a9 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -141,9 +141,9 @@ static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { static int pie_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_PIE_MAX + 1]; - unsigned int qlen, dropped = 0; int err; err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy, @@ -193,15 +193,17 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR])); /* Drop excess packets if new limit is lower */ - qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, true); - dropped += qdisc_pkt_len(skb); - qdisc_qstats_backlog_dec(sch, skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_qdisc_drop(skb, sch); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; -- cgit v1.2.3 From 709788b154caf042874d765628ffa860f0bb0d1e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 4 Aug 2025 09:54:05 -0400 Subject: Bluetooth: hci_core: Fix using {cis,bis}_capable for current settings {cis,bis}_capable only indicates the controller supports the feature since it doesn't check that LE is enabled so it shall not be used for current setting, instead this introduces {cis,bis}_enabled macros that can be used to indicate that these features are currently enabled. Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections") Fixes: eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections") Fixes: ae7533613133 ("Bluetooth: Check for ISO support in controller") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 4 ++-- include/net/bluetooth/hci_core.h | 13 ++++++++++++- net/bluetooth/hci_sync.c | 4 ++-- net/bluetooth/iso.c | 14 +++++++------- net/bluetooth/mgmt.c | 10 +++++----- 5 files changed, 28 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index ada5b56a4413..e5751f3070b8 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -647,7 +647,7 @@ static inline void sco_exit(void) #if IS_ENABLED(CONFIG_BT_LE) int iso_init(void); int iso_exit(void); -bool iso_enabled(void); +bool iso_inited(void); #else static inline int iso_init(void) { @@ -659,7 +659,7 @@ static inline int iso_exit(void) return 0; } -static inline bool iso_enabled(void) +static inline bool iso_inited(void) { return false; } diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 4dc11c66f7b8..bc29f2e2e16f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1915,6 +1915,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); !hci_dev_test_flag(dev, HCI_RPA_EXPIRED)) #define adv_rpa_valid(adv) (bacmp(&adv->random_addr, BDADDR_ANY) && \ !adv->rpa_expired) +#define le_enabled(dev) (lmp_le_capable(dev) && \ + hci_dev_test_flag(dev, HCI_LE_ENABLED)) #define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M)) @@ -1981,14 +1983,23 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* CIS Master/Slave and BIS support */ #define iso_capable(dev) (cis_capable(dev) || bis_capable(dev)) +#define iso_enabled(dev) (le_enabled(dev) && iso_capable(dev)) #define cis_capable(dev) \ (cis_central_capable(dev) || cis_peripheral_capable(dev)) +#define cis_enabled(dev) (le_enabled(dev) && cis_capable(dev)) #define cis_central_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_CENTRAL) +#define cis_central_enabled(dev) \ + (le_enabled(dev) && cis_central_capable(dev)) #define cis_peripheral_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL) +#define cis_peripheral_enabled(dev) \ + (le_enabled(dev) && cis_peripheral_capable(dev)) #define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER) -#define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) +#define bis_enabled(dev) (le_enabled(dev) && bis_capable(dev)) +#define sync_recv_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) +#define sync_recv_enabled(dev) (le_enabled(dev) && sync_recv_capable(dev)) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ (!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG))) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 387c128f2ba0..aa7d7a8ec3ee 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4531,14 +4531,14 @@ static int hci_le_set_host_feature_sync(struct hci_dev *hdev) { struct hci_cp_le_set_host_feature cp; - if (!cis_capable(hdev)) + if (!iso_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); /* Connected Isochronous Channels (Host Support) */ cp.bit_number = 32; - cp.bit_value = 1; + cp.bit_value = iso_enabled(hdev) ? 0x01 : 0x00; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index eaffd25570e3..5ce823ca3aaf 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -2483,11 +2483,11 @@ static const struct net_proto_family iso_sock_family_ops = { .create = iso_sock_create, }; -static bool iso_inited; +static bool inited; -bool iso_enabled(void) +bool iso_inited(void) { - return iso_inited; + return inited; } int iso_init(void) @@ -2496,7 +2496,7 @@ int iso_init(void) BUILD_BUG_ON(sizeof(struct sockaddr_iso) > sizeof(struct sockaddr)); - if (iso_inited) + if (inited) return -EALREADY; err = proto_register(&iso_proto, 0); @@ -2524,7 +2524,7 @@ int iso_init(void) iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs, NULL, &iso_debugfs_fops); - iso_inited = true; + inited = true; return 0; @@ -2535,7 +2535,7 @@ error: int iso_exit(void) { - if (!iso_inited) + if (!inited) return -EALREADY; bt_procfs_cleanup(&init_net, "iso"); @@ -2549,7 +2549,7 @@ int iso_exit(void) proto_unregister(&iso_proto); - iso_inited = false; + inited = false; return 0; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 1ce682038b51..c42dffe77daf 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -922,16 +922,16 @@ static u32 get_current_settings(struct hci_dev *hdev) if (hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) settings |= MGMT_SETTING_WIDEBAND_SPEECH; - if (cis_central_capable(hdev)) + if (cis_central_enabled(hdev)) settings |= MGMT_SETTING_CIS_CENTRAL; - if (cis_peripheral_capable(hdev)) + if (cis_peripheral_enabled(hdev)) settings |= MGMT_SETTING_CIS_PERIPHERAL; - if (bis_capable(hdev)) + if (bis_enabled(hdev)) settings |= MGMT_SETTING_ISO_BROADCASTER; - if (sync_recv_capable(hdev)) + if (sync_recv_enabled(hdev)) settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; if (ll_privacy_capable(hdev)) @@ -4513,7 +4513,7 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, } if (IS_ENABLED(CONFIG_BT_LE)) { - flags = iso_enabled() ? BIT(0) : 0; + flags = iso_inited() ? BIT(0) : 0; memcpy(rp->features[idx].uuid, iso_socket_uuid, 16); rp->features[idx].flags = cpu_to_le32(flags); idx++; -- cgit v1.2.3 From 3dcf7175f2c04bd3a7d50db3fa42a0bd933b6e23 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 4 Aug 2025 14:05:03 -0400 Subject: Bluetooth: hci_core: Fix using ll_privacy_capable for current settings ll_privacy_capable only indicates that the controller supports the feature but it doesnt' check that LE is enabled so it end up being marked as active in the current settings when it shouldn't. Fixes: ad383c2c65a5 ("Bluetooth: hci_sync: Enable advertising when LL privacy is enabled") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/mgmt.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index bc29f2e2e16f..bb30bde6f0e8 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1934,6 +1934,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) #define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY) +#define ll_privacy_enabled(dev) (le_enabled(dev) && ll_privacy_capable(dev)) #define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \ ((dev)->commands[39] & 0x04)) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c42dffe77daf..3166f5fb876b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -934,7 +934,7 @@ static u32 get_current_settings(struct hci_dev *hdev) if (sync_recv_enabled(hdev)) settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; - if (ll_privacy_capable(hdev)) + if (ll_privacy_enabled(hdev)) settings |= MGMT_SETTING_LL_PRIVACY; return settings; -- cgit v1.2.3 From 9d4b01a0bf8d2163ae129c9c537cb0753ad5a2aa Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 14 Aug 2025 11:57:19 -0400 Subject: Bluetooth: hci_core: Fix not accounting for BIS/CIS/PA links separately This fixes the likes of hci_conn_num(CIS_LINK) returning the total of ISO connection which includes BIS_LINK as well, so this splits the iso_num into each link type and introduces hci_iso_num that can be used in places where the total number of ISO connection still needs to be used. Fixes: 23205562ffc8 ("Bluetooth: separate CIS_LINK and BIS_LINK link types") Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index bb30bde6f0e8..6906af7a8f24 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -129,7 +129,9 @@ struct hci_conn_hash { struct list_head list; unsigned int acl_num; unsigned int sco_num; - unsigned int iso_num; + unsigned int cis_num; + unsigned int bis_num; + unsigned int pa_num; unsigned int le_num; unsigned int le_num_peripheral; }; @@ -1014,9 +1016,13 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c) h->sco_num++; break; case CIS_LINK: + h->cis_num++; + break; case BIS_LINK: + h->bis_num++; + break; case PA_LINK: - h->iso_num++; + h->pa_num++; break; } } @@ -1042,9 +1048,13 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c) h->sco_num--; break; case CIS_LINK: + h->cis_num--; + break; case BIS_LINK: + h->bis_num--; + break; case PA_LINK: - h->iso_num--; + h->pa_num--; break; } } @@ -1061,9 +1071,11 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type) case ESCO_LINK: return h->sco_num; case CIS_LINK: + return h->cis_num; case BIS_LINK: + return h->bis_num; case PA_LINK: - return h->iso_num; + return h->pa_num; default: return 0; } @@ -1073,7 +1085,15 @@ static inline unsigned int hci_conn_count(struct hci_dev *hdev) { struct hci_conn_hash *c = &hdev->conn_hash; - return c->acl_num + c->sco_num + c->le_num + c->iso_num; + return c->acl_num + c->sco_num + c->le_num + c->cis_num + c->bis_num + + c->pa_num; +} + +static inline unsigned int hci_iso_count(struct hci_dev *hdev) +{ + struct hci_conn_hash *c = &hdev->conn_hash; + + return c->cis_num + c->bis_num; } static inline bool hci_conn_valid(struct hci_dev *hdev, struct hci_conn *conn) -- cgit v1.2.3 From 730ff06d3f5cc2ce0348414b78c10528b767d4a3 Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Thu, 14 Aug 2025 07:04:10 -0700 Subject: net: mana: Use page pool fragments for RX buffers instead of full pages to improve memory efficiency. This patch enhances RX buffer handling in the mana driver by allocating pages from a page pool and slicing them into MTU-sized fragments, rather than dedicating a full page per packet. This approach is especially beneficial on systems with large base page sizes like 64KB. Key improvements: - Proper integration of page pool for RX buffer allocations. - MTU-sized buffer slicing to improve memory utilization. - Reduce overall per Rx queue memory footprint. - Automatic fallback to full-page buffers when: * Jumbo frames are enabled (MTU > PAGE_SIZE / 2). * The XDP path is active, to avoid complexities with fragment reuse. Testing on VMs with 64KB pages shows around 200% throughput improvement. Memory efficiency is significantly improved due to reduced wastage in page allocations. Example: We are now able to fit 35 rx buffers in a single 64kb page for MTU size of 1500, instead of 1 rx buffer per page previously. Tested: - iperf3, iperf2, and nttcp benchmarks. - Jumbo frames with MTU 9000. - Native XDP programs (XDP_PASS, XDP_DROP, XDP_TX, XDP_REDIRECT) for testing the XDP path in driver. - Memory leak detection (kmemleak). - Driver load/unload, reboot, and stress scenarios. Reviewed-by: Jacob Keller Reviewed-by: Saurabh Sengar Reviewed-by: Haiyang Zhang Signed-off-by: Dipayaan Roy Link: https://patch.msgid.link/20250814140410.GA22089@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_bpf.c | 46 +++++++- drivers/net/ethernet/microsoft/mana/mana_en.c | 151 +++++++++++++++++-------- include/net/mana/mana.h | 4 + 3 files changed, 150 insertions(+), 51 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/microsoft/mana/mana_bpf.c b/drivers/net/ethernet/microsoft/mana/mana_bpf.c index d30721d4516f..7697c9b52ed3 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_bpf.c +++ b/drivers/net/ethernet/microsoft/mana/mana_bpf.c @@ -174,6 +174,7 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog, struct mana_port_context *apc = netdev_priv(ndev); struct bpf_prog *old_prog; struct gdma_context *gc; + int err; gc = apc->ac->gdma_dev->gdma_context; @@ -195,11 +196,45 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog, */ apc->bpf_prog = prog; - if (old_prog) - bpf_prog_put(old_prog); + if (apc->port_is_up) { + /* Re-create rxq's after xdp prog was loaded or unloaded. + * Ex: re create rxq's to switch from full pages to smaller + * size page fragments when xdp prog is unloaded and + * vice-versa. + */ + + /* Pre-allocate buffers to prevent failure in mana_attach */ + err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "XDP: Insufficient memory for tx/rx re-config"); + return err; + } + + err = mana_detach(ndev, false); + if (err) { + netdev_err(ndev, + "mana_detach failed at xdp set: %d\n", err); + NL_SET_ERR_MSG_MOD(extack, + "XDP: Re-config failed at detach"); + goto err_dealloc_rxbuffs; + } + + err = mana_attach(ndev); + if (err) { + netdev_err(ndev, + "mana_attach failed at xdp set: %d\n", err); + NL_SET_ERR_MSG_MOD(extack, + "XDP: Re-config failed at attach"); + goto err_dealloc_rxbuffs; + } - if (apc->port_is_up) mana_chn_setxdp(apc, prog); + mana_pre_dealloc_rxbufs(apc); + } + + if (old_prog) + bpf_prog_put(old_prog); if (prog) ndev->max_mtu = MANA_XDP_MTU_MAX; @@ -207,6 +242,11 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog, ndev->max_mtu = gc->adapter_mtu - ETH_HLEN; return 0; + +err_dealloc_rxbuffs: + apc->bpf_prog = old_prog; + mana_pre_dealloc_rxbufs(apc); + return err; } int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 550843e2164b..f4fc86f20213 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -57,6 +57,15 @@ static bool mana_en_need_log(struct mana_port_context *apc, int err) return true; } +static void mana_put_rx_page(struct mana_rxq *rxq, struct page *page, + bool from_pool) +{ + if (from_pool) + page_pool_put_full_page(rxq->page_pool, page, false); + else + put_page(page); +} + /* Microsoft Azure Network Adapter (MANA) functions */ static int mana_open(struct net_device *ndev) @@ -630,21 +639,40 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da) } /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */ -static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size, - u32 *headroom) +static void mana_get_rxbuf_cfg(struct mana_port_context *apc, + int mtu, u32 *datasize, u32 *alloc_size, + u32 *headroom, u32 *frag_count) { - if (mtu > MANA_XDP_MTU_MAX) - *headroom = 0; /* no support for XDP */ - else - *headroom = XDP_PACKET_HEADROOM; + u32 len, buf_size; - *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom); + /* Calculate datasize first (consistent across all cases) */ + *datasize = mtu + ETH_HLEN; - /* Using page pool in this case, so alloc_size is PAGE_SIZE */ - if (*alloc_size < PAGE_SIZE) - *alloc_size = PAGE_SIZE; + /* For xdp and jumbo frames make sure only one packet fits per page */ + if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) { + if (mana_xdp_get(apc)) { + *headroom = XDP_PACKET_HEADROOM; + *alloc_size = PAGE_SIZE; + } else { + *headroom = 0; /* no support for XDP */ + *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + + *headroom); + } - *datasize = mtu + ETH_HLEN; + *frag_count = 1; + return; + } + + /* Standard MTU case - optimize for multiple packets per page */ + *headroom = 0; + + /* Calculate base buffer size needed */ + len = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom); + buf_size = ALIGN(len, MANA_RX_FRAG_ALIGNMENT); + + /* Calculate how many packets can fit in a page */ + *frag_count = PAGE_SIZE / buf_size; + *alloc_size = buf_size; } int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_queues) @@ -656,8 +684,9 @@ int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_qu void *va; int i; - mana_get_rxbuf_cfg(new_mtu, &mpc->rxbpre_datasize, - &mpc->rxbpre_alloc_size, &mpc->rxbpre_headroom); + mana_get_rxbuf_cfg(mpc, new_mtu, &mpc->rxbpre_datasize, + &mpc->rxbpre_alloc_size, &mpc->rxbpre_headroom, + &mpc->rxbpre_frag_count); dev = mpc->ac->gdma_dev->gdma_context->dev; @@ -1842,8 +1871,11 @@ drop_xdp: drop: if (from_pool) { - page_pool_recycle_direct(rxq->page_pool, - virt_to_head_page(buf_va)); + if (rxq->frag_count == 1) + page_pool_recycle_direct(rxq->page_pool, + virt_to_head_page(buf_va)); + else + page_pool_free_va(rxq->page_pool, buf_va, true); } else { WARN_ON_ONCE(rxq->xdp_save_va); /* Save for reuse */ @@ -1859,33 +1891,46 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev, dma_addr_t *da, bool *from_pool) { struct page *page; + u32 offset; void *va; - *from_pool = false; - /* Reuse XDP dropped page if available */ - if (rxq->xdp_save_va) { - va = rxq->xdp_save_va; - rxq->xdp_save_va = NULL; - } else { - page = page_pool_dev_alloc_pages(rxq->page_pool); - if (!page) + /* Don't use fragments for jumbo frames or XDP where it's 1 fragment + * per page. + */ + if (rxq->frag_count == 1) { + /* Reuse XDP dropped page if available */ + if (rxq->xdp_save_va) { + va = rxq->xdp_save_va; + page = virt_to_head_page(va); + rxq->xdp_save_va = NULL; + } else { + page = page_pool_dev_alloc_pages(rxq->page_pool); + if (!page) + return NULL; + + *from_pool = true; + va = page_to_virt(page); + } + + *da = dma_map_single(dev, va + rxq->headroom, rxq->datasize, + DMA_FROM_DEVICE); + if (dma_mapping_error(dev, *da)) { + mana_put_rx_page(rxq, page, *from_pool); return NULL; + } - *from_pool = true; - va = page_to_virt(page); + return va; } - *da = dma_map_single(dev, va + rxq->headroom, rxq->datasize, - DMA_FROM_DEVICE); - if (dma_mapping_error(dev, *da)) { - if (*from_pool) - page_pool_put_full_page(rxq->page_pool, page, false); - else - put_page(virt_to_head_page(va)); - + page = page_pool_dev_alloc_frag(rxq->page_pool, &offset, + rxq->alloc_size); + if (!page) return NULL; - } + + va = page_to_virt(page) + offset; + *da = page_pool_get_dma_addr(page) + offset + rxq->headroom; + *from_pool = true; return va; } @@ -1902,9 +1947,9 @@ static void mana_refill_rx_oob(struct device *dev, struct mana_rxq *rxq, va = mana_get_rxfrag(rxq, dev, &da, &from_pool); if (!va) return; - - dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize, - DMA_FROM_DEVICE); + if (!rxoob->from_pool || rxq->frag_count == 1) + dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize, + DMA_FROM_DEVICE); *old_buf = rxoob->buf_va; *old_fp = rxoob->from_pool; @@ -2315,15 +2360,15 @@ static void mana_destroy_rxq(struct mana_port_context *apc, if (!rx_oob->buf_va) continue; - dma_unmap_single(dev, rx_oob->sgl[0].address, - rx_oob->sgl[0].size, DMA_FROM_DEVICE); - page = virt_to_head_page(rx_oob->buf_va); - if (rx_oob->from_pool) - page_pool_put_full_page(rxq->page_pool, page, false); - else - put_page(page); + if (rxq->frag_count == 1 || !rx_oob->from_pool) { + dma_unmap_single(dev, rx_oob->sgl[0].address, + rx_oob->sgl[0].size, DMA_FROM_DEVICE); + mana_put_rx_page(rxq, page, rx_oob->from_pool); + } else { + page_pool_free_va(rxq->page_pool, rx_oob->buf_va, true); + } rx_oob->buf_va = NULL; } @@ -2429,11 +2474,22 @@ static int mana_create_page_pool(struct mana_rxq *rxq, struct gdma_context *gc) struct page_pool_params pprm = {}; int ret; - pprm.pool_size = mpc->rx_queue_size; + pprm.pool_size = mpc->rx_queue_size / rxq->frag_count + 1; pprm.nid = gc->numa_node; pprm.napi = &rxq->rx_cq.napi; pprm.netdev = rxq->ndev; pprm.order = get_order(rxq->alloc_size); + pprm.queue_idx = rxq->rxq_idx; + pprm.dev = gc->dev; + + /* Let the page pool do the dma map when page sharing with multiple + * fragments enabled for rx buffers. + */ + if (rxq->frag_count > 1) { + pprm.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; + pprm.max_len = PAGE_SIZE; + pprm.dma_dir = DMA_FROM_DEVICE; + } rxq->page_pool = page_pool_create(&pprm); @@ -2472,9 +2528,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, rxq->rxq_idx = rxq_idx; rxq->rxobj = INVALID_MANA_HANDLE; - mana_get_rxbuf_cfg(ndev->mtu, &rxq->datasize, &rxq->alloc_size, - &rxq->headroom); - + mana_get_rxbuf_cfg(apc, ndev->mtu, &rxq->datasize, &rxq->alloc_size, + &rxq->headroom, &rxq->frag_count); /* Create page pool for RX queue */ err = mana_create_page_pool(rxq, gc); if (err) { diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index e1030a7d2daa..0921485565c0 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -65,6 +65,8 @@ enum TRI_STATE { #define MANA_STATS_RX_COUNT 5 #define MANA_STATS_TX_COUNT 11 +#define MANA_RX_FRAG_ALIGNMENT 64 + struct mana_stats_rx { u64 packets; u64 bytes; @@ -328,6 +330,7 @@ struct mana_rxq { u32 datasize; u32 alloc_size; u32 headroom; + u32 frag_count; mana_handle_t rxobj; @@ -510,6 +513,7 @@ struct mana_port_context { u32 rxbpre_datasize; u32 rxbpre_alloc_size; u32 rxbpre_headroom; + u32 rxbpre_frag_count; struct bpf_prog *bpf_prog; -- cgit v1.2.3 From f7161b234f2ec7f18999009c4becc04eeb6b12a7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:14 +0000 Subject: net-memcg: Introduce mem_cgroup_from_sk(). We will store a flag in the lowest bit of sk->sk_memcg. Then, directly dereferencing sk->sk_memcg will be illegal, and we do not want to allow touching the raw sk->sk_memcg in many places. Let's introduce mem_cgroup_from_sk(). Other places accessing the raw sk->sk_memcg will be converted later. Note that we cannot define the helper as an inline function in memcontrol.h as we cannot access any fields of struct sock there due to circular dependency, so it is placed in sock.h. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 12 ++++++++++++ mm/memcontrol.c | 13 +++++++++---- net/ipv4/inet_connection_sock.c | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index c8a4b283df6f..811f95ea8d00 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2594,6 +2594,18 @@ static inline gfp_t gfp_memcg_charge(void) return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) +{ + return sk->sk_memcg; +} +#else +static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) +{ + return NULL; +} +#endif + static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46713b9ece06..d8a52d1d08fa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5020,19 +5020,24 @@ out: void mem_cgroup_sk_free(struct sock *sk) { - if (sk->sk_memcg) - css_put(&sk->sk_memcg->css); + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + + if (memcg) + css_put(&memcg->css); } void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) { + struct mem_cgroup *memcg; + if (sk->sk_memcg == newsk->sk_memcg) return; mem_cgroup_sk_free(newsk); - if (sk->sk_memcg) - css_get(&sk->sk_memcg->css); + memcg = mem_cgroup_from_sk(sk); + if (memcg) + css_get(&memcg->css); newsk->sk_memcg = sk->sk_memcg; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 724bd9ed6cd4..93569bbe00f4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -718,7 +718,7 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) lock_sock(newsk); mem_cgroup_sk_alloc(newsk); - if (newsk->sk_memcg) { + if (mem_cgroup_from_sk(newsk)) { /* The socket has not been accepted yet, no need * to look at newsk->sk_wmem_queued. */ -- cgit v1.2.3 From 43049b0db03823c2cd003ca7d3dddcd3924da8dc Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:15 +0000 Subject: net-memcg: Introduce mem_cgroup_sk_enabled(). The socket memcg feature is enabled by a static key and only works for non-root cgroup. We check both conditions in many places. Let's factorise it as a helper function. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-8-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/proto_memory.h | 2 +- include/net/sock.h | 10 ++++++++++ include/net/tcp.h | 2 +- net/core/sock.c | 6 +++--- net/ipv4/tcp_output.c | 2 +- 5 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h index a6ab2f4f5e28..859e63de81c4 100644 --- a/include/net/proto_memory.h +++ b/include/net/proto_memory.h @@ -31,7 +31,7 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) if (!sk->sk_prot->memory_pressure) return false; - if (mem_cgroup_sockets_enabled && sk->sk_memcg && + if (mem_cgroup_sk_enabled(sk) && mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; diff --git a/include/net/sock.h b/include/net/sock.h index 811f95ea8d00..3efdf680401d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2599,11 +2599,21 @@ static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) { return sk->sk_memcg; } + +static inline bool mem_cgroup_sk_enabled(const struct sock *sk) +{ + return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk); +} #else static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) { return NULL; } + +static inline bool mem_cgroup_sk_enabled(const struct sock *sk) +{ + return false; +} #endif static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) diff --git a/include/net/tcp.h b/include/net/tcp.h index 526a26e7a150..9f01b6be6444 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -275,7 +275,7 @@ extern unsigned long tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) { - if (mem_cgroup_sockets_enabled && sk->sk_memcg && + if (mem_cgroup_sk_enabled(sk) && mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; diff --git a/net/core/sock.c b/net/core/sock.c index 000940ecf360..ab658fe23e1e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1032,7 +1032,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) bool charged; int pages; - if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) + if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) @@ -3271,7 +3271,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) { + if (mem_cgroup_sk_enabled(sk)) { memcg = sk->sk_memcg; charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); if (!charged) @@ -3398,7 +3398,7 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { sk_memory_allocated_sub(sk, amount); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) + if (mem_cgroup_sk_enabled(sk)) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); if (sk_under_global_memory_pressure(sk) && diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index caf11920a878..37fb320e6f70 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3578,7 +3578,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size) sk_forward_alloc_add(sk, amt << PAGE_SHIFT); sk_memory_allocated_add(sk, amt); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) + if (mem_cgroup_sk_enabled(sk)) mem_cgroup_charge_skmem(sk->sk_memcg, amt, gfp_memcg_charge() | __GFP_NOFAIL); } -- cgit v1.2.3 From b2ffd10cddde47cc6830e4981e91e3215def62b1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:17 +0000 Subject: net-memcg: Pass struct sock to mem_cgroup_sk_under_memory_pressure(). We will store a flag in the lowest bit of sk->sk_memcg. Then, we cannot pass the raw pointer to mem_cgroup_under_socket_pressure(). Let's pass struct sock to it and rename the function to match other functions starting with mem_cgroup_sk_. Note that the helper is moved to sock.h to use mem_cgroup_from_sk(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-10-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/memcontrol.h | 18 ------------------ include/net/proto_memory.h | 2 +- include/net/sock.h | 22 ++++++++++++++++++++++ include/net/tcp.h | 2 +- 4 files changed, 24 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0837d3de3a68..fb27e3d2fdac 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1642,19 +1642,6 @@ static inline u64 mem_cgroup_get_socket_pressure(struct mem_cgroup *memcg) } #endif -static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) -{ -#ifdef CONFIG_MEMCG_V1 - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - return !!memcg->tcpmem_pressure; -#endif /* CONFIG_MEMCG_V1 */ - do { - if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg))) - return true; - } while ((memcg = parent_mem_cgroup(memcg))); - return false; -} - int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); @@ -1686,11 +1673,6 @@ static inline void mem_cgroup_sk_uncharge(const struct sock *sk, { } -static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) -{ - return false; -} - static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h index 859e63de81c4..8e91a8fa31b5 100644 --- a/include/net/proto_memory.h +++ b/include/net/proto_memory.h @@ -32,7 +32,7 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) return false; if (mem_cgroup_sk_enabled(sk) && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) + mem_cgroup_sk_under_memory_pressure(sk)) return true; return !!READ_ONCE(*sk->sk_prot->memory_pressure); diff --git a/include/net/sock.h b/include/net/sock.h index 3efdf680401d..3bc4d566f7d0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2604,6 +2604,23 @@ static inline bool mem_cgroup_sk_enabled(const struct sock *sk) { return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk); } + +static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) +{ + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + +#ifdef CONFIG_MEMCG_V1 + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return !!memcg->tcpmem_pressure; +#endif /* CONFIG_MEMCG_V1 */ + + do { + if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg))) + return true; + } while ((memcg = parent_mem_cgroup(memcg))); + + return false; +} #else static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) { @@ -2614,6 +2631,11 @@ static inline bool mem_cgroup_sk_enabled(const struct sock *sk) { return false; } + +static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) +{ + return false; +} #endif static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) diff --git a/include/net/tcp.h b/include/net/tcp.h index 9f01b6be6444..2936b8175950 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -276,7 +276,7 @@ extern unsigned long tcp_memory_pressure; static inline bool tcp_under_memory_pressure(const struct sock *sk) { if (mem_cgroup_sk_enabled(sk) && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) + mem_cgroup_sk_under_memory_pressure(sk)) return true; return READ_ONCE(tcp_memory_pressure); -- cgit v1.2.3 From bf64002c94fc330b996bc438f3d1b6bd3d781659 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:18 +0000 Subject: net: Define sk_memcg under CONFIG_MEMCG. Except for sk_clone_lock(), all accesses to sk->sk_memcg is done under CONFIG_MEMCG. As a bonus, let's define sk->sk_memcg under CONFIG_MEMCG. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-11-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 ++ net/core/sock.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 3bc4d566f7d0..1c49ea13af4a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -443,7 +443,9 @@ struct sock { __cacheline_group_begin(sock_read_rxtx); int sk_err; struct socket *sk_socket; +#ifdef CONFIG_MEMCG struct mem_cgroup *sk_memcg; +#endif #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif diff --git a/net/core/sock.c b/net/core/sock.c index 5537ca263858..ab6953d295df 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2512,8 +2512,10 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); +#ifdef CONFIG_MEMCG /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; +#endif cgroup_sk_clone(&newsk->sk_cgrp_data); @@ -4452,7 +4454,9 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); +#ifdef CONFIG_MEMCG CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); +#endif CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); -- cgit v1.2.3 From bf40785fa437c1752117df2edb3220e9c37d98a6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Aug 2025 13:54:24 -0700 Subject: sctp: Use HMAC-SHA1 and HMAC-SHA256 library for chunk authentication For SCTP chunk authentication, use the HMAC-SHA1 and HMAC-SHA256 library functions instead of crypto_shash. This is simpler and faster. There's no longer any need to pre-allocate 'crypto_shash' objects; the SCTP code now simply calls into the HMAC code directly. As part of this, make SCTP always support both HMAC-SHA1 and HMAC-SHA256. Previously, it only guaranteed support for HMAC-SHA1. However, HMAC-SHA256 tended to be supported too anyway, as it was supported if CONFIG_CRYPTO_SHA256 was enabled elsewhere in the kconfig. Acked-by: Xin Long Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250818205426.30222-4-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/net/sctp/auth.h | 17 ++--- include/net/sctp/constants.h | 4 -- include/net/sctp/structs.h | 5 -- net/sctp/Kconfig | 16 +++-- net/sctp/auth.c | 166 ++++++++----------------------------------- net/sctp/chunk.c | 3 +- net/sctp/sm_make_chunk.c | 2 +- net/sctp/sm_statefuns.c | 2 +- net/sctp/socket.c | 10 --- 9 files changed, 48 insertions(+), 177 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h index d4b3b2dcd15b..3d5879e08e78 100644 --- a/include/net/sctp/auth.h +++ b/include/net/sctp/auth.h @@ -22,16 +22,11 @@ struct sctp_endpoint; struct sctp_association; struct sctp_authkey; struct sctp_hmacalgo; -struct crypto_shash; -/* - * Define a generic struct that will hold all the info - * necessary for an HMAC transform - */ +/* Defines an HMAC algorithm supported by SCTP chunk authentication */ struct sctp_hmac { - __u16 hmac_id; /* one of the above ids */ - char *hmac_name; /* name for loading */ - __u16 hmac_len; /* length of the signature */ + __u16 hmac_id; /* one of SCTP_AUTH_HMAC_ID_* */ + __u16 hmac_len; /* length of the HMAC value in bytes */ }; /* This is generic structure that containst authentication bytes used @@ -78,9 +73,9 @@ int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep, struct sctp_association *asoc, gfp_t gfp); int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp); -void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]); -struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id); -struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc); +const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id); +const struct sctp_hmac * +sctp_auth_asoc_get_hmac(const struct sctp_association *asoc); void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, struct sctp_hmac_algo_param *hmacs); int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 5859e0a16a58..8e0f4c4f7750 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -417,16 +417,12 @@ enum { SCTP_AUTH_HMAC_ID_RESERVED_0, SCTP_AUTH_HMAC_ID_SHA1, SCTP_AUTH_HMAC_ID_RESERVED_2, -#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE) SCTP_AUTH_HMAC_ID_SHA256, -#endif __SCTP_AUTH_HMAC_MAX }; #define SCTP_AUTH_HMAC_ID_MAX __SCTP_AUTH_HMAC_MAX - 1 #define SCTP_AUTH_NUM_HMACS __SCTP_AUTH_HMAC_MAX -#define SCTP_SHA1_SIG_SIZE 20 -#define SCTP_SHA256_SIG_SIZE 32 /* SCTP-AUTH, Section 3.2 * The chunk types for INIT, INIT-ACK, SHUTDOWN-COMPLETE and AUTH chunks diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 8a540ad9b509..6be6aec25731 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1329,11 +1329,6 @@ struct sctp_endpoint { /* rcvbuf acct. policy. */ __u32 rcvbuf_policy; - /* SCTP AUTH: array of the HMACs that will be allocated - * we need this per association so that we don't serialize - */ - struct crypto_shash **auth_hmacs; - /* SCTP-AUTH: hmacs for the endpoint encoded into parameter */ struct sctp_hmac_algo_param *auth_hmacs_list; diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 24d5a35ce894..09c77b4d161b 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -7,9 +7,9 @@ menuconfig IP_SCTP tristate "The SCTP Protocol" depends on INET depends on IPV6 || IPV6=n - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 + select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_UTILS select NET_CRC32C select NET_UDP_TUNNEL help @@ -79,15 +79,17 @@ config SCTP_COOKIE_HMAC_MD5 bool "Enable optional MD5 hmac cookie generation" help Enable optional MD5 hmac based SCTP cookie generation - select CRYPTO_HMAC if SCTP_COOKIE_HMAC_MD5 - select CRYPTO_MD5 if SCTP_COOKIE_HMAC_MD5 + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 config SCTP_COOKIE_HMAC_SHA1 bool "Enable optional SHA1 hmac cookie generation" help Enable optional SHA1 hmac based SCTP cookie generation - select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1 - select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1 + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_SHA1 config INET_SCTP_DIAG depends on INET_DIAG diff --git a/net/sctp/auth.c b/net/sctp/auth.c index c58fffc86a0c..82aad477590e 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -12,36 +12,37 @@ * Vlad Yasevich */ -#include +#include +#include #include #include -#include #include #include -static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { +static const struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { { /* id 0 is reserved. as all 0 */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0, }, { .hmac_id = SCTP_AUTH_HMAC_ID_SHA1, - .hmac_name = "hmac(sha1)", - .hmac_len = SCTP_SHA1_SIG_SIZE, + .hmac_len = SHA1_DIGEST_SIZE, }, { /* id 2 is reserved as well */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2, }, -#if IS_ENABLED(CONFIG_CRYPTO_SHA256) { .hmac_id = SCTP_AUTH_HMAC_ID_SHA256, - .hmac_name = "hmac(sha256)", - .hmac_len = SCTP_SHA256_SIG_SIZE, + .hmac_len = SHA256_DIGEST_SIZE, } -#endif }; +static bool sctp_hmac_supported(__u16 hmac_id) +{ + return hmac_id < ARRAY_SIZE(sctp_hmac_list) && + sctp_hmac_list[hmac_id].hmac_len != 0; +} void sctp_auth_key_put(struct sctp_auth_bytes *key) { @@ -444,76 +445,7 @@ struct sctp_shared_key *sctp_auth_get_shkey( return NULL; } -/* - * Initialize all the possible digest transforms that we can use. Right - * now, the supported digests are SHA1 and SHA256. We do this here once - * because of the restrictiong that transforms may only be allocated in - * user context. This forces us to pre-allocated all possible transforms - * at the endpoint init time. - */ -int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) -{ - struct crypto_shash *tfm = NULL; - __u16 id; - - /* If the transforms are already allocated, we are done */ - if (ep->auth_hmacs) - return 0; - - /* Allocated the array of pointers to transorms */ - ep->auth_hmacs = kcalloc(SCTP_AUTH_NUM_HMACS, - sizeof(struct crypto_shash *), - gfp); - if (!ep->auth_hmacs) - return -ENOMEM; - - for (id = 0; id < SCTP_AUTH_NUM_HMACS; id++) { - - /* See is we support the id. Supported IDs have name and - * length fields set, so that we can allocated and use - * them. We can safely just check for name, for without the - * name, we can't allocate the TFM. - */ - if (!sctp_hmac_list[id].hmac_name) - continue; - - /* If this TFM has been allocated, we are all set */ - if (ep->auth_hmacs[id]) - continue; - - /* Allocate the ID */ - tfm = crypto_alloc_shash(sctp_hmac_list[id].hmac_name, 0, 0); - if (IS_ERR(tfm)) - goto out_err; - - ep->auth_hmacs[id] = tfm; - } - - return 0; - -out_err: - /* Clean up any successful allocations */ - sctp_auth_destroy_hmacs(ep->auth_hmacs); - ep->auth_hmacs = NULL; - return -ENOMEM; -} - -/* Destroy the hmac tfm array */ -void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]) -{ - int i; - - if (!auth_hmacs) - return; - - for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) { - crypto_free_shash(auth_hmacs[i]); - } - kfree(auth_hmacs); -} - - -struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) +const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) { return &sctp_hmac_list[hmac_id]; } @@ -521,7 +453,8 @@ struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) /* Get an hmac description information that we can use to build * the AUTH chunk */ -struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) +const struct sctp_hmac * +sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) { struct sctp_hmac_algo_param *hmacs; __u16 n_elt; @@ -543,26 +476,10 @@ struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) sizeof(struct sctp_paramhdr)) >> 1; for (i = 0; i < n_elt; i++) { id = ntohs(hmacs->hmac_ids[i]); - - /* Check the id is in the supported range. And - * see if we support the id. Supported IDs have name and - * length fields set, so that we can allocate and use - * them. We can safely just check for name, for without the - * name, we can't allocate the TFM. - */ - if (id > SCTP_AUTH_HMAC_ID_MAX || - !sctp_hmac_list[id].hmac_name) { - id = 0; - continue; - } - - break; + if (sctp_hmac_supported(id)) + return &sctp_hmac_list[id]; } - - if (id == 0) - return NULL; - - return &sctp_hmac_list[id]; + return NULL; } static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id) @@ -606,7 +523,6 @@ int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, struct sctp_hmac_algo_param *hmacs) { - struct sctp_endpoint *ep; __u16 id; int i; int n_params; @@ -617,16 +533,9 @@ void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, n_params = (ntohs(hmacs->param_hdr.length) - sizeof(struct sctp_paramhdr)) >> 1; - ep = asoc->ep; for (i = 0; i < n_params; i++) { id = ntohs(hmacs->hmac_ids[i]); - - /* Check the id is in the supported range */ - if (id > SCTP_AUTH_HMAC_ID_MAX) - continue; - - /* If this TFM has been allocated, use this id */ - if (ep->auth_hmacs[id]) { + if (sctp_hmac_supported(id)) { asoc->default_hmac_id = id; break; } @@ -709,10 +618,9 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, struct sctp_shared_key *ep_key, gfp_t gfp) { struct sctp_auth_bytes *asoc_key; - struct crypto_shash *tfm; __u16 key_id, hmac_id; - unsigned char *end; int free_key = 0; + size_t data_len; __u8 *digest; /* Extract the info we need: @@ -733,19 +641,17 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, free_key = 1; } - /* set up scatter list */ - end = skb_tail_pointer(skb); - - tfm = asoc->ep->auth_hmacs[hmac_id]; - + data_len = skb_tail_pointer(skb) - (unsigned char *)auth; digest = (u8 *)(&auth->auth_hdr + 1); - if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len)) - goto free; - - crypto_shash_tfm_digest(tfm, (u8 *)auth, end - (unsigned char *)auth, - digest); + if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1) { + hmac_sha1_usingrawkey(asoc_key->data, asoc_key->len, + (const u8 *)auth, data_len, digest); + } else { + WARN_ON_ONCE(hmac_id != SCTP_AUTH_HMAC_ID_SHA256); + hmac_sha256_usingrawkey(asoc_key->data, asoc_key->len, + (const u8 *)auth, data_len, digest); + } -free: if (free_key) sctp_auth_key_put(asoc_key); } @@ -788,14 +694,11 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, for (i = 0; i < hmacs->shmac_num_idents; i++) { id = hmacs->shmac_idents[i]; - if (id > SCTP_AUTH_HMAC_ID_MAX) + if (!sctp_hmac_supported(id)) return -EOPNOTSUPP; if (SCTP_AUTH_HMAC_ID_SHA1 == id) has_sha1 = 1; - - if (!sctp_hmac_list[id].hmac_name) - return -EOPNOTSUPP; } if (!has_sha1) @@ -1021,8 +924,6 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep, int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) { - int err = -ENOMEM; - /* Allocate space for HMACS and CHUNKS authentication * variables. There are arrays that we encode directly * into parameters to make the rest of the operations easier. @@ -1060,13 +961,6 @@ int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) ep->auth_chunk_list = auth_chunks; } - /* Allocate and initialize transorms arrays for supported - * HMACs. - */ - err = sctp_auth_init_hmacs(ep, gfp); - if (err) - goto nomem; - return 0; nomem: @@ -1075,7 +969,7 @@ nomem: kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; - return err; + return -ENOMEM; } void sctp_auth_free(struct sctp_endpoint *ep) @@ -1084,6 +978,4 @@ void sctp_auth_free(struct sctp_endpoint *ep) kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; - sctp_auth_destroy_hmacs(ep->auth_hmacs); - ep->auth_hmacs = NULL; } diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index fd4f8243cc35..c655b571ca01 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -184,7 +184,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, * DATA. */ if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) { - struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); + const struct sctp_hmac *hmac_desc = + sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index d099b605e44a..a1a3c8494c5d 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1320,7 +1320,7 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, __u16 key_id) { struct sctp_authhdr auth_hdr; - struct sctp_hmac *hmac_desc; + const struct sctp_hmac *hmac_desc; struct sctp_chunk *retval; /* Get the first hmac that the peer told us to use */ diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d4d5b14b49b3..4cb8f393434d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4362,7 +4362,7 @@ static enum sctp_ierror sctp_sf_authenticate( struct sctp_shared_key *sh_key = NULL; struct sctp_authhdr *auth_hdr; __u8 *save_digest, *digest; - struct sctp_hmac *hmac; + const struct sctp_hmac *hmac; unsigned int sig_len; __u16 key_id; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 4921416434f9..0292881a847c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9581,16 +9581,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, if (err) return err; - /* New ep's auth_hmacs should be set if old ep's is set, in case - * that net->sctp.auth_enable has been changed to 0 by users and - * new ep's auth_hmacs couldn't be set in sctp_endpoint_init(). - */ - if (oldsp->ep->auth_hmacs) { - err = sctp_auth_init_hmacs(newsp->ep, GFP_KERNEL); - if (err) - return err; - } - sctp_auto_asconf_init(newsp); /* Move any messages in the old socket's receive queue that are for the -- cgit v1.2.3 From 2f3dd6ec901f29aef5fff3d7a63b1371d67c1760 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Aug 2025 13:54:25 -0700 Subject: sctp: Convert cookie authentication to use HMAC-SHA256 Convert SCTP cookies to use HMAC-SHA256, instead of the previous choice of the legacy algorithms HMAC-MD5 and HMAC-SHA1. Simplify and optimize the code by using the HMAC-SHA256 library instead of crypto_shash, and by preparing the HMAC key when it is generated instead of per-operation. This doesn't break compatibility, since the cookie format is an implementation detail, not part of the SCTP protocol itself. Note that the cookie size doesn't change either. The HMAC field was already 32 bytes, even though previously at most 20 bytes were actually compared. 32 bytes exactly fits an untruncated HMAC-SHA256 value. So, although we could safely truncate the MAC to something slightly shorter, for now just keep the cookie size the same. I also considered SipHash, but that would generate only 8-byte MACs. An 8-byte MAC *might* suffice here. However, there's quite a lot of information in the SCTP cookies: more than in TCP SYN cookies. So absent an analysis that occasional forgeries of all that information is okay in SCTP, I errored on the side of caution. Remove HMAC-MD5 and HMAC-SHA1 as options, since the new HMAC-SHA256 option is just better. It's faster as well as more secure. For example, benchmarking on x86_64, cookie authentication is now nearly 3x as fast as the previous default choice and implementation of HMAC-MD5. Also just make the kernel always support cookie authentication if SCTP is supported at all, rather than making it optional in the build. (It was sort of optional before, but it didn't really work properly. E.g., a kernel with CONFIG_SCTP_COOKIE_HMAC_MD5=n still supported HMAC-MD5 cookie authentication if CONFIG_CRYPTO_HMAC and CONFIG_CRYPTO_MD5 happened to be enabled in the kconfig for other reasons.) Acked-by: Xin Long Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250818205426.30222-5-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 11 +++---- include/net/netns/sctp.h | 4 +-- include/net/sctp/constants.h | 5 ++- include/net/sctp/structs.h | 30 +++++------------- net/sctp/Kconfig | 43 ++++++------------------- net/sctp/endpointola.c | 23 +++++++------- net/sctp/protocol.c | 11 ++----- net/sctp/sm_make_chunk.c | 57 ++++++++++------------------------ net/sctp/socket.c | 31 +----------------- net/sctp/sysctl.c | 51 +++++++++++++----------------- 10 files changed, 79 insertions(+), 187 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 9756d16e3df1..3d6782683eee 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -3508,16 +3508,13 @@ cookie_hmac_alg - STRING a listening sctp socket to a connecting client in the INIT-ACK chunk. Valid values are: - * md5 - * sha1 + * sha256 * none - Ability to assign md5 or sha1 as the selected alg is predicated on the - configuration of those algorithms at build time (CONFIG_CRYPTO_MD5 and - CONFIG_CRYPTO_SHA1). + md5 and sha1 are also accepted for backwards compatibility, but cause + sha256 to be selected. - Default: Dependent on configuration. MD5 if available, else SHA1 if - available, else none. + Default: sha256 rcvbuf_policy - INTEGER Determines if the receive buffer is attributed to the socket or to diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index d25cd7a9c5ff..c0f97f36389e 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -75,8 +75,8 @@ struct netns_sctp { /* Whether Cookie Preservative is enabled(1) or not(0) */ int cookie_preserve_enable; - /* The namespace default hmac alg */ - char *sctp_hmac_alg; + /* Whether cookie authentication is enabled(1) or not(0) */ + int cookie_auth_enable; /* Valid.Cookie.Life - 60 seconds */ unsigned int valid_cookie_life; diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 8e0f4c4f7750..ae3376ba0b99 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -296,9 +296,8 @@ enum { SCTP_MAX_GABS = 16 }; */ #define SCTP_DEFAULT_MINSEGMENT 512 /* MTU size ... if no mtu disc */ -#define SCTP_SECRET_SIZE 32 /* Number of octets in a 256 bits. */ - -#define SCTP_SIGNATURE_SIZE 20 /* size of a SLA-1 signature */ +#define SCTP_COOKIE_KEY_SIZE 32 /* size of cookie HMAC key */ +#define SCTP_COOKIE_MAC_SIZE 32 /* size of HMAC field in cookies */ #define SCTP_COOKIE_MULTIPLE 32 /* Pad out our cookie to make our hash * functions simpler to write. diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 6be6aec25731..2ae390219efd 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -32,6 +32,7 @@ #ifndef __sctp_structs_h__ #define __sctp_structs_h__ +#include #include #include #include @@ -68,7 +69,6 @@ struct sctp_outq; struct sctp_bind_addr; struct sctp_ulpq; struct sctp_ep_common; -struct crypto_shash; struct sctp_stream; @@ -155,10 +155,6 @@ struct sctp_sock { /* PF_ family specific functions. */ struct sctp_pf *pf; - /* Access to HMAC transform. */ - struct crypto_shash *hmac; - char *sctp_hmac_alg; - /* What is our base endpointer? */ struct sctp_endpoint *ep; @@ -227,7 +223,8 @@ struct sctp_sock { frag_interleave:1, recvrcvinfo:1, recvnxtinfo:1, - data_ready_signalled:1; + data_ready_signalled:1, + cookie_auth_enable:1; atomic_t pd_mode; @@ -335,7 +332,7 @@ struct sctp_cookie { /* The format of our cookie that we send to our peer. */ struct sctp_signed_cookie { - __u8 signature[SCTP_SECRET_SIZE]; + __u8 mac[SCTP_COOKIE_MAC_SIZE]; __u32 __pad; /* force sctp_cookie alignment to 64 bits */ struct sctp_cookie c; } __packed; @@ -1307,22 +1304,9 @@ struct sctp_endpoint { /* This is really a list of struct sctp_association entries. */ struct list_head asocs; - /* Secret Key: A secret key used by this endpoint to compute - * the MAC. This SHOULD be a cryptographic quality - * random number with a sufficient length. - * Discussion in [RFC1750] can be helpful in - * selection of the key. - */ - __u8 secret_key[SCTP_SECRET_SIZE]; - - /* digest: This is a digest of the sctp cookie. This field is - * only used on the receive path when we try to validate - * that the cookie has not been tampered with. We put - * this here so we pre-allocate this once and can re-use - * on every receive. - */ - __u8 *digest; - + /* Cookie authentication key used by this endpoint */ + struct hmac_sha256_key cookie_auth_key; + /* sendbuf acct. policy. */ __u32 sndbuf_policy; diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 09c77b4d161b..e947646a380c 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -49,48 +49,25 @@ config SCTP_DBG_OBJCNT 'cat /proc/net/sctp/sctp_dbg_objcnt' If unsure, say N + choice - prompt "Default SCTP cookie HMAC encoding" - default SCTP_DEFAULT_COOKIE_HMAC_MD5 + prompt "Default SCTP cookie authentication method" + default SCTP_DEFAULT_COOKIE_HMAC_SHA256 help - This option sets the default sctp cookie hmac algorithm - when in doubt select 'md5' + This option sets the default SCTP cookie authentication method, for + when a method hasn't been explicitly selected via the + net.sctp.cookie_hmac_alg sysctl. -config SCTP_DEFAULT_COOKIE_HMAC_MD5 - bool "Enable optional MD5 hmac cookie generation" - help - Enable optional MD5 hmac based SCTP cookie generation - select SCTP_COOKIE_HMAC_MD5 + If unsure, choose the default (HMAC-SHA256). -config SCTP_DEFAULT_COOKIE_HMAC_SHA1 - bool "Enable optional SHA1 hmac cookie generation" - help - Enable optional SHA1 hmac based SCTP cookie generation - select SCTP_COOKIE_HMAC_SHA1 +config SCTP_DEFAULT_COOKIE_HMAC_SHA256 + bool "HMAC-SHA256" config SCTP_DEFAULT_COOKIE_HMAC_NONE - bool "Use no hmac alg in SCTP cookie generation" - help - Use no hmac algorithm in SCTP cookie generation + bool "None" endchoice -config SCTP_COOKIE_HMAC_MD5 - bool "Enable optional MD5 hmac cookie generation" - help - Enable optional MD5 hmac based SCTP cookie generation - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_MD5 - -config SCTP_COOKIE_HMAC_SHA1 - bool "Enable optional SHA1 hmac cookie generation" - help - Enable optional SHA1 hmac based SCTP cookie generation - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 - config INET_SCTP_DIAG depends on INET_DIAG def_tristate INET_DIAG diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 7e77b450697c..31e989dfe846 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -35,6 +35,15 @@ /* Forward declarations for internal helpers. */ static void sctp_endpoint_bh_rcv(struct work_struct *work); +static void gen_cookie_auth_key(struct hmac_sha256_key *key) +{ + u8 raw_key[SCTP_COOKIE_KEY_SIZE]; + + get_random_bytes(raw_key, sizeof(raw_key)); + hmac_sha256_preparekey(key, raw_key, sizeof(raw_key)); + memzero_explicit(raw_key, sizeof(raw_key)); +} + /* * Initialize the base fields of the endpoint structure. */ @@ -45,10 +54,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, struct net *net = sock_net(sk); struct sctp_shared_key *null_key; - ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp); - if (!ep->digest) - return NULL; - ep->asconf_enable = net->sctp.addip_enable; ep->auth_enable = net->sctp.auth_enable; if (ep->auth_enable) { @@ -90,8 +95,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Get the receive buffer policy for this endpoint */ ep->rcvbuf_policy = net->sctp.rcvbuf_policy; - /* Initialize the secret key used with cookie. */ - get_random_bytes(ep->secret_key, sizeof(ep->secret_key)); + /* Generate the cookie authentication key. */ + gen_cookie_auth_key(&ep->cookie_auth_key); /* SCTP-AUTH extensions*/ INIT_LIST_HEAD(&ep->endpoint_shared_keys); @@ -118,7 +123,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, nomem_shkey: sctp_auth_free(ep); nomem: - kfree(ep->digest); return NULL; } @@ -205,9 +209,6 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) return; } - /* Free the digest buffer */ - kfree(ep->digest); - /* SCTP-AUTH: Free up AUTH releated data such as shared keys * chunks and hmacs arrays that were allocated */ @@ -218,7 +219,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) sctp_inq_free(&ep->base.inqueue); sctp_bind_addr_free(&ep->base.bind_addr); - memset(ep->secret_key, 0, sizeof(ep->secret_key)); + memzero_explicit(&ep->cookie_auth_key, sizeof(ep->cookie_auth_key)); sk = ep->base.sk; /* Remove and free the port */ diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index a5ccada55f2b..3b2373b3bd5d 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1334,14 +1334,9 @@ static int __net_init sctp_defaults_init(struct net *net) /* Whether Cookie Preservative is enabled(1) or not(0) */ net->sctp.cookie_preserve_enable = 1; - /* Default sctp sockets to use md5 as their hmac alg */ -#if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5) - net->sctp.sctp_hmac_alg = "md5"; -#elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1) - net->sctp.sctp_hmac_alg = "sha1"; -#else - net->sctp.sctp_hmac_alg = NULL; -#endif + /* Whether cookie authentication is enabled(1) or not(0) */ + net->sctp.cookie_auth_enable = + !IS_ENABLED(CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE); /* Max.Burst - 4 */ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index a1a3c8494c5d..2c0017d058d4 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -30,7 +30,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include #include #include @@ -1675,8 +1674,10 @@ static struct sctp_cookie_param *sctp_pack_cookie( * out on the network. */ retval = kzalloc(*cookie_len, GFP_ATOMIC); - if (!retval) - goto nodata; + if (!retval) { + *cookie_len = 0; + return NULL; + } cookie = (struct sctp_signed_cookie *) retval->body; @@ -1707,26 +1708,14 @@ static struct sctp_cookie_param *sctp_pack_cookie( memcpy((__u8 *)(cookie + 1) + ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len); - if (sctp_sk(ep->base.sk)->hmac) { - struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; - int err; - - /* Sign the message. */ - err = crypto_shash_setkey(tfm, ep->secret_key, - sizeof(ep->secret_key)) ?: - crypto_shash_tfm_digest(tfm, (u8 *)&cookie->c, bodysize, - cookie->signature); - if (err) - goto free_cookie; + /* Sign the cookie, if cookie authentication is enabled. */ + if (sctp_sk(ep->base.sk)->cookie_auth_enable) { + static_assert(sizeof(cookie->mac) == SHA256_DIGEST_SIZE); + hmac_sha256(&ep->cookie_auth_key, (const u8 *)&cookie->c, + bodysize, cookie->mac); } return retval; - -free_cookie: - kfree(retval); -nodata: - *cookie_len = 0; - return NULL; } /* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */ @@ -1741,7 +1730,6 @@ struct sctp_association *sctp_unpack_cookie( struct sctp_signed_cookie *cookie; struct sk_buff *skb = chunk->skb; struct sctp_cookie *bear_cookie; - __u8 *digest = ep->digest; enum sctp_scope scope; unsigned int len; ktime_t kt; @@ -1771,30 +1759,19 @@ struct sctp_association *sctp_unpack_cookie( cookie = chunk->subh.cookie_hdr; bear_cookie = &cookie->c; - if (!sctp_sk(ep->base.sk)->hmac) - goto no_hmac; + /* Verify the cookie's MAC, if cookie authentication is enabled. */ + if (sctp_sk(ep->base.sk)->cookie_auth_enable) { + u8 mac[SHA256_DIGEST_SIZE]; - /* Check the signature. */ - { - struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; - int err; - - err = crypto_shash_setkey(tfm, ep->secret_key, - sizeof(ep->secret_key)) ?: - crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize, - digest); - if (err) { - *error = -SCTP_IERROR_NOMEM; + hmac_sha256(&ep->cookie_auth_key, (const u8 *)bear_cookie, + bodysize, mac); + static_assert(sizeof(cookie->mac) == sizeof(mac)); + if (crypto_memneq(mac, cookie->mac, sizeof(mac))) { + *error = -SCTP_IERROR_BAD_SIG; goto fail; } } - if (crypto_memneq(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { - *error = -SCTP_IERROR_BAD_SIG; - goto fail; - } - -no_hmac: /* IG Section 2.35.2: * 3) Compare the port numbers and the verification tag contained * within the COOKIE ECHO chunk to the actual port numbers and the diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0292881a847c..ed8293a34240 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -37,7 +37,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include #include #include @@ -4987,7 +4986,7 @@ static int sctp_init_sock(struct sock *sk) sp->default_rcv_context = 0; sp->max_burst = net->sctp.max_burst; - sp->sctp_hmac_alg = net->sctp.sctp_hmac_alg; + sp->cookie_auth_enable = net->sctp.cookie_auth_enable; /* Initialize default setup parameters. These parameters * can be modified with the SCTP_INITMSG socket option or @@ -5079,8 +5078,6 @@ static int sctp_init_sock(struct sock *sk) if (!sp->ep) return -ENOMEM; - sp->hmac = NULL; - sk->sk_destruct = sctp_destruct_sock; SCTP_DBG_OBJCNT_INC(sock); @@ -5117,18 +5114,8 @@ static void sctp_destroy_sock(struct sock *sk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } -/* Triggered when there are no references on the socket anymore */ -static void sctp_destruct_common(struct sock *sk) -{ - struct sctp_sock *sp = sctp_sk(sk); - - /* Free up the HMAC transform. */ - crypto_free_shash(sp->hmac); -} - static void sctp_destruct_sock(struct sock *sk) { - sctp_destruct_common(sk); inet_sock_destruct(sk); } @@ -8530,22 +8517,8 @@ static int sctp_listen_start(struct sock *sk, int backlog) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; - struct crypto_shash *tfm = NULL; - char alg[32]; int err; - /* Allocate HMAC for generating cookie. */ - if (!sp->hmac && sp->sctp_hmac_alg) { - sprintf(alg, "hmac(%s)", sp->sctp_hmac_alg); - tfm = crypto_alloc_shash(alg, 0, 0); - if (IS_ERR(tfm)) { - net_info_ratelimited("failed to load transform for %s: %ld\n", - sp->sctp_hmac_alg, PTR_ERR(tfm)); - return -ENOSYS; - } - sctp_sk(sk)->hmac = tfm; - } - /* * If a bind() or sctp_bindx() is not called prior to a listen() * call that allows new associations to be accepted, the system @@ -9561,7 +9534,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, * copy. */ newsp->ep = newep; - newsp->hmac = NULL; /* Hook this new socket in to the bind_hash list. */ head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk), @@ -9713,7 +9685,6 @@ struct proto sctp_prot = { static void sctp_v6_destruct_sock(struct sock *sk) { - sctp_destruct_common(sk); inet6_sock_destruct(sk); } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index ee3eac338a9d..19acc57c3ed9 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -174,7 +174,7 @@ static struct ctl_table sctp_net_table[] = { }, { .procname = "cookie_hmac_alg", - .data = &init_net.sctp.sctp_hmac_alg, + .data = &init_net.sctp.cookie_auth_enable, .maxlen = 8, .mode = 0644, .proc_handler = proc_sctp_do_hmac_alg, @@ -388,10 +388,8 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, - sctp.sctp_hmac_alg); + sctp.cookie_auth_enable); struct ctl_table tbl; - bool changed = false; - char *none = "none"; char tmp[8] = {0}; int ret; @@ -399,35 +397,28 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, if (write) { tbl.data = tmp; - tbl.maxlen = sizeof(tmp); - } else { - tbl.data = net->sctp.sctp_hmac_alg ? : none; - tbl.maxlen = strlen(tbl.data); - } - - ret = proc_dostring(&tbl, write, buffer, lenp, ppos); - if (write && ret == 0) { -#ifdef CONFIG_CRYPTO_MD5 - if (!strncmp(tmp, "md5", 3)) { - net->sctp.sctp_hmac_alg = "md5"; - changed = true; + tbl.maxlen = sizeof(tmp) - 1; + ret = proc_dostring(&tbl, 1, buffer, lenp, ppos); + if (ret) + return ret; + if (!strcmp(tmp, "sha256") || + /* for backwards compatibility */ + !strcmp(tmp, "md5") || !strcmp(tmp, "sha1")) { + net->sctp.cookie_auth_enable = 1; + return 0; } -#endif -#ifdef CONFIG_CRYPTO_SHA1 - if (!strncmp(tmp, "sha1", 4)) { - net->sctp.sctp_hmac_alg = "sha1"; - changed = true; + if (!strcmp(tmp, "none")) { + net->sctp.cookie_auth_enable = 0; + return 0; } -#endif - if (!strncmp(tmp, "none", 4)) { - net->sctp.sctp_hmac_alg = NULL; - changed = true; - } - if (!changed) - ret = -EINVAL; + return -EINVAL; } - - return ret; + if (net->sctp.cookie_auth_enable) + tbl.data = (char *)"sha256"; + else + tbl.data = (char *)"none"; + tbl.maxlen = strlen(tbl.data); + return proc_dostring(&tbl, 0, buffer, lenp, ppos); } static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, -- cgit v1.2.3 From a6d4f25888b83b8300aef28d9ee22765c1cc9b34 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Aug 2025 17:40:30 +0000 Subject: net: set net.core.rmem_max and net.core.wmem_max to 4 MB SO_RCVBUF and SO_SNDBUF have limited range today, unless distros or system admins change rmem_max and wmem_max. Even iproute2 uses 1 MB SO_RCVBUF which is capped by the kernel. Decouple [rw]mem_max and [rw]mem_default and increase [rw]mem_max to 4 MB. Before: $ sysctl net.core.rmem_default net.core.rmem_max net.core.wmem_default net.core.wmem_max net.core.rmem_default = 212992 net.core.rmem_max = 212992 net.core.wmem_default = 212992 net.core.wmem_max = 212992 After: $ sysctl net.core.rmem_default net.core.rmem_max net.core.wmem_default net.core.wmem_max net.core.rmem_default = 212992 net.core.rmem_max = 4194304 net.core.wmem_default = 212992 net.core.wmem_max = 4194304 Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250819174030.1986278-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/admin-guide/sysctl/net.rst | 4 ++++ Documentation/networking/ip-sysctl.rst | 6 +++--- include/net/sock.h | 4 ++-- net/core/sock.c | 8 ++++---- net/ipv4/arp.c | 2 +- net/ipv6/ndisc.c | 2 +- 6 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 7b0c4291c686..2ef50828aff1 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -222,6 +222,8 @@ rmem_max The maximum receive socket buffer size in bytes. +Default: 4194304 + rps_default_mask ---------------- @@ -247,6 +249,8 @@ wmem_max The maximum send socket buffer size in bytes. +Default: 4194304 + message_burst and message_cost ------------------------------ diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 43badb338d22..9f5891c9b07b 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -209,7 +209,7 @@ neigh/default/unres_qlen_bytes - INTEGER Setting negative value is meaningless and will return error. - Default: SK_WMEM_MAX, (same as net.core.wmem_default). + Default: SK_WMEM_DEFAULT, (same as net.core.wmem_default). Exact value depends on architecture and kernel options, but should be enough to allow queuing 256 packets @@ -805,8 +805,8 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max This value results in initial window of 65535. max: maximal size of receive buffer allowed for automatically - selected receiver buffers for TCP socket. This value does not override - net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables + selected receiver buffers for TCP socket. + Calling setsockopt() with SO_RCVBUF disables automatic tuning of that socket's receive buffer size, in which case this value is ignored. Default: between 131072 and 32MB, depending on RAM size. diff --git a/include/net/sock.h b/include/net/sock.h index 1c49ea13af4a..63a6a48afb48 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2970,8 +2970,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo); */ #define _SK_MEM_PACKETS 256 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_WMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; diff --git a/net/core/sock.c b/net/core/sock.c index ab6953d295df..8002ac6293dc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -281,12 +281,12 @@ static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. */ -__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; +__u32 sysctl_wmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_wmem_max); -__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; +__u32 sysctl_rmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_rmem_max); -__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; -__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; +__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT; +__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 5cfc1c939673..833f2cf97178 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -170,7 +170,7 @@ struct neigh_table arp_tbl = { [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7d5abb3158ec..57aaa7ae8ac3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -130,7 +130,7 @@ struct neigh_table nd_tbl = { [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, -- cgit v1.2.3 From b64d035f77b1f02ab449393342264b44950a75ae Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 15 Aug 2025 06:19:58 +0000 Subject: bonding: update LACP activity flag after setting lacp_active The port's actor_oper_port_state activity flag should be updated immediately after changing the lacp_active option to reflect the current mode correctly. Fixes: 3a755cd8b7c6 ("bonding: add new option lacp_active") Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250815062000.22220-2-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_3ad.c | 25 +++++++++++++++++++++++++ drivers/net/bonding/bond_options.c | 1 + include/net/bond_3ad.h | 1 + 3 files changed, 27 insertions(+) (limited to 'include/net') diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 2fca8e84ab10..414fecfd2a0e 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -2883,6 +2883,31 @@ void bond_3ad_update_lacp_rate(struct bonding *bond) spin_unlock_bh(&bond->mode_lock); } +/** + * bond_3ad_update_lacp_active - change the lacp active + * @bond: bonding struct + * + * Update actor_oper_port_state when lacp_active is modified. + */ +void bond_3ad_update_lacp_active(struct bonding *bond) +{ + struct port *port = NULL; + struct list_head *iter; + struct slave *slave; + int lacp_active; + + lacp_active = bond->params.lacp_active; + spin_lock_bh(&bond->mode_lock); + bond_for_each_slave(bond, slave, iter) { + port = &(SLAVE_AD_INFO(slave)->port); + if (lacp_active) + port->actor_oper_port_state |= LACP_STATE_LACP_ACTIVITY; + else + port->actor_oper_port_state &= ~LACP_STATE_LACP_ACTIVITY; + } + spin_unlock_bh(&bond->mode_lock); +} + size_t bond_3ad_stats_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_RX */ diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 1d639a3be6ba..3b6f815c55ff 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1660,6 +1660,7 @@ static int bond_option_lacp_active_set(struct bonding *bond, netdev_dbg(bond->dev, "Setting LACP active to %s (%llu)\n", newval->string, newval->value); bond->params.lacp_active = newval->value; + bond_3ad_update_lacp_active(bond); return 0; } diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index 2053cd8e788a..dba369a2cf27 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -307,6 +307,7 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); int bond_3ad_set_carrier(struct bonding *bond); void bond_3ad_update_lacp_rate(struct bonding *bond); +void bond_3ad_update_lacp_active(struct bonding *bond); void bond_3ad_update_ad_actor_settings(struct bonding *bond); int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats); size_t bond_3ad_stats_size(void); -- cgit v1.2.3 From 07cf71bf25cd4e5735ff13468e7b86f02c3665cb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 19 Aug 2025 19:56:50 -0700 Subject: net: page_pool: add page_pool_get() There is a page_pool_put() function but no get equivalent. Having multiple references to a page pool is quite useful. It avoids branching in create / destroy paths in drivers which support memory providers. Use the new helper in bnxt. Acked-by: Jesper Dangaard Brouer Reviewed-by: Dragos Tatulea Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250820025704.166248-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 11 +++++------ include/net/page_pool/helpers.h | 5 +++++ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index ba99de403138..14fa5b9e0f6c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3800,8 +3800,7 @@ static void bnxt_free_rx_rings(struct bnxt *bp) xdp_rxq_info_unreg(&rxr->xdp_rxq); page_pool_destroy(rxr->page_pool); - if (bnxt_separate_head_pool(rxr)) - page_pool_destroy(rxr->head_pool); + page_pool_destroy(rxr->head_pool); rxr->page_pool = rxr->head_pool = NULL; kfree(rxr->rx_agg_bmap); @@ -3848,6 +3847,8 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, pool = page_pool_create(&pp); if (IS_ERR(pool)) goto err_destroy_pp; + } else { + page_pool_get(pool); } rxr->head_pool = pool; @@ -15903,8 +15904,7 @@ err_rxq_info_unreg: xdp_rxq_info_unreg(&clone->xdp_rxq); err_page_pool_destroy: page_pool_destroy(clone->page_pool); - if (bnxt_separate_head_pool(clone)) - page_pool_destroy(clone->head_pool); + page_pool_destroy(clone->head_pool); clone->page_pool = NULL; clone->head_pool = NULL; return rc; @@ -15922,8 +15922,7 @@ static void bnxt_queue_mem_free(struct net_device *dev, void *qmem) xdp_rxq_info_unreg(&rxr->xdp_rxq); page_pool_destroy(rxr->page_pool); - if (bnxt_separate_head_pool(rxr)) - page_pool_destroy(rxr->head_pool); + page_pool_destroy(rxr->head_pool); rxr->page_pool = NULL; rxr->head_pool = NULL; diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index db180626be06..aa3719f28216 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -489,6 +489,11 @@ page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, offset, dma_sync_size); } +static inline void page_pool_get(struct page_pool *pool) +{ + refcount_inc(&pool->user_cnt); +} + static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); -- cgit v1.2.3 From 6bbd0d3f0c23fc53c17409dd7476f38ae0ff0cd9 Mon Sep 17 00:00:00 2001 From: Pavel Shpakovskiy Date: Fri, 22 Aug 2025 12:20:55 +0300 Subject: Bluetooth: hci_sync: fix set_local_name race condition Function set_name_sync() uses hdev->dev_name field to send HCI_OP_WRITE_LOCAL_NAME command, but copying from data to hdev->dev_name is called after mgmt cmd was queued, so it is possible that function set_name_sync() will read old name value. This change adds name as a parameter for function hci_update_name_sync() to avoid race condition. Fixes: 6f6ff38a1e14 ("Bluetooth: hci_sync: Convert MGMT_OP_SET_LOCAL_NAME") Signed-off-by: Pavel Shpakovskiy Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_sync.h | 2 +- net/bluetooth/hci_sync.c | 6 +++--- net/bluetooth/mgmt.c | 5 ++++- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 5224f57f6af2..e352a4e0ef8d 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -93,7 +93,7 @@ int hci_update_class_sync(struct hci_dev *hdev); int hci_update_eir_sync(struct hci_dev *hdev); int hci_update_class_sync(struct hci_dev *hdev); -int hci_update_name_sync(struct hci_dev *hdev); +int hci_update_name_sync(struct hci_dev *hdev, const u8 *name); int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode); int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 31d72b9683ef..b6f888d8354e 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3481,13 +3481,13 @@ int hci_update_scan_sync(struct hci_dev *hdev) return hci_write_scan_enable_sync(hdev, scan); } -int hci_update_name_sync(struct hci_dev *hdev) +int hci_update_name_sync(struct hci_dev *hdev, const u8 *name) { struct hci_cp_write_local_name cp; memset(&cp, 0, sizeof(cp)); - memcpy(cp.name, hdev->dev_name, sizeof(cp.name)); + memcpy(cp.name, name, sizeof(cp.name)); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp, @@ -3540,7 +3540,7 @@ int hci_powered_update_sync(struct hci_dev *hdev) hci_write_fast_connectable_sync(hdev, false); hci_update_scan_sync(hdev); hci_update_class_sync(hdev); - hci_update_name_sync(hdev); + hci_update_name_sync(hdev, hdev->dev_name); hci_update_eir_sync(hdev); } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 90e37ff2c85d..50634ef5c8b7 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3892,8 +3892,11 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err) static int set_name_sync(struct hci_dev *hdev, void *data) { + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_set_local_name *cp = cmd->param; + if (lmp_bredr_capable(hdev)) { - hci_update_name_sync(hdev); + hci_update_name_sync(hdev, cp->name); hci_update_eir_sync(hdev); } -- cgit v1.2.3 From 2d842b6c670b9bffee7c16cda284eb49644d8169 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:06:57 +0000 Subject: tcp: Remove timewait_sock_ops.twsk_destructor(). Since DCCP has been removed, sk->sk_prot->twsk_prot->twsk_destructor is always tcp_twsk_destructor(). Let's call tcp_twsk_destructor() directly in inet_twsk_free() and remove ->twsk_destructor(). While at it, tcp_twsk_destructor() is un-exported. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250822190803.540788-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/timewait_sock.h | 7 ------- net/ipv4/inet_timewait_sock.c | 5 +++-- net/ipv4/tcp_ipv4.c | 1 - net/ipv4/tcp_minisocks.c | 1 - net/ipv6/tcp_ipv6.c | 1 - 5 files changed, 3 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 62b3e9f2aed4..0a85ac64a66d 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -15,13 +15,6 @@ struct timewait_sock_ops { struct kmem_cache *twsk_slab; char *twsk_slab_name; unsigned int twsk_obj_size; - void (*twsk_destructor)(struct sock *sk); }; -static inline void twsk_destructor(struct sock *sk) -{ - if (sk->sk_prot->twsk_prot->twsk_destructor != NULL) - sk->sk_prot->twsk_prot->twsk_destructor(sk); -} - #endif /* _TIMEWAIT_SOCK_H */ diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 875ff923a8ed..5b5426b8ee92 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -15,7 +15,7 @@ #include #include #include - +#include /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash @@ -74,7 +74,8 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) void inet_twsk_free(struct inet_timewait_sock *tw) { struct module *owner = tw->tw_prot->owner; - twsk_destructor((struct sock *)tw); + + tcp_twsk_destructor((struct sock *)tw); kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9543f1538359..a48b98f67b6a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2459,7 +2459,6 @@ do_time_wait: static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), - .twsk_destructor= tcp_twsk_destructor, }; void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2994c9222c9c..d1c9e4088646 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -401,7 +401,6 @@ void tcp_twsk_destructor(struct sock *sk) #endif tcp_ao_destroy_sock(sk, true); } -EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor); void tcp_twsk_purge(struct list_head *net_exit_list) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5620d9e50e19..d99717376bff 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2050,7 +2050,6 @@ void tcp_v6_early_demux(struct sk_buff *skb) static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), - .twsk_destructor = tcp_twsk_destructor, }; INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) -- cgit v1.2.3 From cb16f4b6c73df4be16b74099f826fea30ef72426 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:06:59 +0000 Subject: tcp: Don't pass hashinfo to socket lookup helpers. These socket lookup functions required struct inet_hashinfo because they are shared by TCP and DCCP. * __inet_lookup_established() * __inet_lookup_listener() * __inet6_lookup_established() * inet6_lookup_listener() DCCP has gone, and we don't need to pass hashinfo down to them. Let's fetch net->ipv4.tcp_death_row.hashinfo directly in the above 4 functions. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250822190803.540788-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c | 9 ++--- drivers/net/ethernet/netronome/nfp/crypto/tls.c | 9 ++--- include/net/inet6_hashtables.h | 18 +++------ include/net/inet_hashtables.h | 37 ++++++++---------- net/core/filter.c | 5 +-- net/ipv4/esp4.c | 4 +- net/ipv4/inet_diag.c | 6 +-- net/ipv4/inet_hashtables.c | 28 +++++++------- net/ipv4/netfilter/nf_socket_ipv4.c | 3 +- net/ipv4/netfilter/nf_tproxy_ipv4.c | 5 +-- net/ipv4/tcp_ipv4.c | 16 +++----- net/ipv4/tcp_offload.c | 3 +- net/ipv6/esp6.c | 4 +- net/ipv6/inet6_hashtables.c | 45 +++++++++++----------- net/ipv6/netfilter/nf_socket_ipv6.c | 3 +- net/ipv6/netfilter/nf_tproxy_ipv6.c | 5 +-- net/ipv6/tcp_ipv6.c | 14 +++---- net/ipv6/tcpv6_offload.c | 3 +- 18 files changed, 94 insertions(+), 123 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index 65ccb33edafb..d7a11ff9bbdb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -498,9 +498,9 @@ static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb) depth += sizeof(struct iphdr); th = (void *)iph + sizeof(struct iphdr); - sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, iph->daddr, - th->dest, netdev->ifindex); + sk = inet_lookup_established(net, iph->saddr, th->source, + iph->daddr, th->dest, + netdev->ifindex); #if IS_ENABLED(CONFIG_IPV6) } else { struct ipv6hdr *ipv6h = (struct ipv6hdr *)iph; @@ -508,8 +508,7 @@ static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb) depth += sizeof(struct ipv6hdr); th = (void *)ipv6h + sizeof(struct ipv6hdr); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &ipv6h->saddr, th->source, + sk = __inet6_lookup_established(net, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->dest), netdev->ifindex, 0); #endif diff --git a/drivers/net/ethernet/netronome/nfp/crypto/tls.c b/drivers/net/ethernet/netronome/nfp/crypto/tls.c index f80f1a6953fa..f252ecdcd2cd 100644 --- a/drivers/net/ethernet/netronome/nfp/crypto/tls.c +++ b/drivers/net/ethernet/netronome/nfp/crypto/tls.c @@ -495,14 +495,13 @@ int nfp_net_tls_rx_resync_req(struct net_device *netdev, switch (ipv6h->version) { case 4: - sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, iph->daddr, - th->dest, netdev->ifindex); + sk = inet_lookup_established(net, iph->saddr, th->source, + iph->daddr, th->dest, + netdev->ifindex); break; #if IS_ENABLED(CONFIG_IPV6) case 6: - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &ipv6h->saddr, th->source, + sk = __inet6_lookup_established(net, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->dest), netdev->ifindex, 0); break; diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index ab3929a2a956..1f985d2012ce 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -41,7 +41,6 @@ static inline unsigned int __inet6_ehashfn(const u32 lhash, * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, @@ -65,7 +64,6 @@ struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, inet6_ehashfn_t *ehashfn); struct sock *inet6_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, @@ -83,7 +81,6 @@ struct sock *inet6_lookup_run_sk_lookup(const struct net *net, inet6_ehashfn_t *ehashfn); static inline struct sock *__inet6_lookup(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, @@ -92,14 +89,14 @@ static inline struct sock *__inet6_lookup(const struct net *net, const int dif, const int sdif, bool *refcounted) { - struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr, - sport, daddr, hnum, + struct sock *sk = __inet6_lookup_established(net, saddr, sport, + daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; - return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport, + return inet6_lookup_listener(net, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } @@ -143,8 +140,7 @@ struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff, return reuse_sk; } -static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, +static inline struct sock *__inet6_lookup_skb(struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, int iif, int sdif, @@ -161,14 +157,12 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; - return __inet6_lookup(net, hashinfo, skb, - doff, &ip6h->saddr, sport, + return __inet6_lookup(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, ntohs(dport), iif, sdif, refcounted); } -struct sock *inet6_lookup(const struct net *net, struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, +struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 19dbd9081d5a..a3b32241c2f2 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -294,7 +294,6 @@ int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, @@ -302,12 +301,12 @@ struct sock *__inet_lookup_listener(const struct net *net, const int dif, const int sdif); static inline struct sock *inet_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - __be32 saddr, __be16 sport, - __be32 daddr, __be16 dport, int dif, int sdif) + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, __be16 dport, + int dif, int sdif) { - return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, + return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, sdif); } @@ -358,7 +357,6 @@ static inline bool inet_match(const struct net *net, const struct sock *sk, * not check it for lookups anymore, thanks Alexey. -DaveM */ struct sock *__inet_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif); @@ -384,18 +382,16 @@ struct sock *inet_lookup_run_sk_lookup(const struct net *net, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn); -static inline struct sock * - inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, - const __be32 saddr, const __be16 sport, - const __be32 daddr, const __be16 dport, - const int dif) +static inline struct sock *inet_lookup_established(struct net *net, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + const int dif) { - return __inet_lookup_established(net, hashinfo, saddr, sport, daddr, + return __inet_lookup_established(net, saddr, sport, daddr, ntohs(dport), dif, 0); } static inline struct sock *__inet_lookup(struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, @@ -405,18 +401,17 @@ static inline struct sock *__inet_lookup(struct net *net, u16 hnum = ntohs(dport); struct sock *sk; - sk = __inet_lookup_established(net, hashinfo, saddr, sport, + sk = __inet_lookup_established(net, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; - return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, + return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet_lookup(struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, @@ -425,7 +420,7 @@ static inline struct sock *inet_lookup(struct net *net, struct sock *sk; bool refcounted; - sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + sk = __inet_lookup(net, skb, doff, saddr, sport, daddr, dport, dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) @@ -473,8 +468,7 @@ struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, return reuse_sk; } -static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, - struct sk_buff *skb, +static inline struct sock *__inet_lookup_skb(struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, @@ -492,8 +486,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; - return __inet_lookup(net, hashinfo, skb, - doff, iph->saddr, sport, + return __inet_lookup(net, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } diff --git a/net/core/filter.c b/net/core/filter.c index 63f3baee2daf..5da1cad66be2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6767,7 +6767,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; bool refcounted = false; struct sock *sk = NULL; @@ -6776,7 +6775,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, hinfo, NULL, 0, + sk = __inet_lookup(net, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); @@ -6790,7 +6789,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, hinfo, NULL, 0, + sk = __inet6_lookup(net, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index f14a41ee4aa1..2c922afadb8f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -132,8 +132,8 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x) dport = encap->encap_dport; spin_unlock_bh(&x->lock); - sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4, - dport, x->props.saddr.a4, sport, 0); + sk = inet_lookup_established(net, x->id.daddr.a4, dport, + x->props.saddr.a4, sport, 0); if (!sk) return ERR_PTR(-ENOENT); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 549f1f521f4f..462406948c84 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -526,18 +526,18 @@ struct sock *inet_diag_find_one_icsk(struct net *net, rcu_read_lock(); if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) - sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3], + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if); else - sk = inet6_lookup(net, hashinfo, NULL, 0, + sk = inet6_lookup(net, NULL, 0, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 374adb8a2640..4bc2b1921d2b 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -425,13 +425,13 @@ struct sock *inet_lookup_run_sk_lookup(const struct net *net, } struct sock *__inet_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; + struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; @@ -444,6 +444,7 @@ struct sock *__inet_lookup_listener(const struct net *net, goto done; } + hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); @@ -489,21 +490,22 @@ void sock_edemux(struct sk_buff *skb) EXPORT_SYMBOL(sock_edemux); struct sock *__inet_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, - const __be32 saddr, const __be16 sport, - const __be32 daddr, const u16 hnum, - const int dif, const int sdif) + const __be32 saddr, const __be16 sport, + const __be32 daddr, const u16 hnum, + const int dif, const int sdif) { - INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); - struct sock *sk; + INET_ADDR_COOKIE(acookie, saddr, daddr); const struct hlist_nulls_node *node; - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); - unsigned int slot = hash & hashinfo->ehash_mask; - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + struct inet_ehash_bucket *head; + struct inet_hashinfo *hashinfo; + unsigned int hash, slot; + struct sock *sk; + + hashinfo = net->ipv4.tcp_death_row.hashinfo; + hash = inet_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash & hashinfo->ehash_mask; + head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index a1350fc25838..5080fa5fbf6a 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -71,8 +71,7 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, { switch (protocol) { case IPPROTO_TCP: - return inet_lookup(net, net->ipv4.tcp_death_row.hashinfo, - skb, doff, saddr, sport, daddr, dport, + return inet_lookup(net, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp4_lib_lookup(net, saddr, sport, daddr, dport, diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 73e66a088e25..041c3f37f237 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -81,7 +81,6 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; struct sock *sk; switch (protocol) { @@ -95,7 +94,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, hinfo, skb, + sk = inet_lookup_listener(net, skb, ip_hdrlen(skb) + __tcp_hdrlen(hp), saddr, sport, daddr, dport, in->ifindex, 0); @@ -109,7 +108,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, */ break; case NF_TPROXY_LOOKUP_ESTABLISHED: - sk = inet_lookup_established(net, hinfo, saddr, sport, + sk = inet_lookup_established(net, saddr, sport, daddr, dport, in->ifindex); break; default: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a48b98f67b6a..a0c93b24c6e0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -506,8 +506,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) struct sock *sk; int err; - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->daddr, th->dest, iph->saddr, + sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr, ntohs(th->source), inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); @@ -823,8 +822,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - NULL, 0, ip_hdr(skb)->saddr, + sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), dif, sdif); /* don't send rst if it can't find key */ @@ -1992,8 +1990,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) if (th->doff < sizeof(struct tcphdr) / 4) return 0; - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, + sk = __inet_lookup_established(net, iph->saddr, th->source, iph->daddr, ntohs(th->dest), skb->skb_iif, inet_sdif(skb)); if (sk) { @@ -2236,8 +2233,7 @@ int tcp_v4_rcv(struct sk_buff *skb) th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); lookup: - sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), th->source, + sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted); if (!sk) goto no_tcp_socket; @@ -2426,9 +2422,7 @@ do_time_wait: &drop_reason); switch (tw_status) { case TCP_TW_SYN: { - struct sock *sk2 = inet_lookup_listener(net, - net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), + struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb), diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index be5c2294610e..e6612bd84d09 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -434,8 +434,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet_get_iif_sdif(skb, &iif, &sdif); iph = skb_gro_network_header(skb); net = dev_net_rcu(skb->dev); - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, + sk = __inet_lookup_established(net, iph->saddr, th->source, iph->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 72adfc107b55..e75da98f5283 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -149,8 +149,8 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) dport = encap->encap_dport; spin_unlock_bh(&x->lock); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6, - dport, &x->props.saddr.in6, ntohs(sport), 0, 0); + sk = __inet6_lookup_established(net, &x->id.daddr.in6, dport, + &x->props.saddr.in6, ntohs(sport), 0, 0); if (!sk) return ERR_PTR(-ENOENT); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index d6c3db31dcab..a3a9ea49fee2 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -47,24 +47,23 @@ EXPORT_SYMBOL_GPL(inet6_ehashfn); * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, - const struct in6_addr *saddr, - const __be16 sport, - const struct in6_addr *daddr, - const u16 hnum, - const int dif, const int sdif) + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif, const int sdif) { - struct sock *sk; - const struct hlist_nulls_node *node; const __portpair ports = INET_COMBINED_PORTS(sport, hnum); - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); - unsigned int slot = hash & hashinfo->ehash_mask; - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; - + const struct hlist_nulls_node *node; + struct inet_ehash_bucket *head; + struct inet_hashinfo *hashinfo; + unsigned int hash, slot; + struct sock *sk; + hashinfo = net->ipv4.tcp_death_row.hashinfo; + hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash & hashinfo->ehash_mask; + head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) @@ -200,13 +199,15 @@ struct sock *inet6_lookup_run_sk_lookup(const struct net *net, EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); struct sock *inet6_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - const __be16 sport, const struct in6_addr *daddr, - const unsigned short hnum, const int dif, const int sdif) + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const unsigned short hnum, + const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; + struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; @@ -219,6 +220,7 @@ struct sock *inet6_lookup_listener(const struct net *net, goto done; } + hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); @@ -243,7 +245,6 @@ done: EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, @@ -252,7 +253,7 @@ struct sock *inet6_lookup(const struct net *net, struct sock *sk; bool refcounted; - sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index 9ea5ef56cb27..ced8bd44828e 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -83,8 +83,7 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, { switch (protocol) { case IPPROTO_TCP: - return inet6_lookup(net, net->ipv4.tcp_death_row.hashinfo, - skb, doff, saddr, sport, daddr, dport, + return inet6_lookup(net, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp6_lib_lookup(net, saddr, sport, daddr, dport, diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index 52f828bb5a83..b2f59ed9d7cc 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -80,7 +80,6 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; struct sock *sk; switch (protocol) { @@ -94,7 +93,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, hinfo, skb, + sk = inet6_lookup_listener(net, skb, thoff + __tcp_hdrlen(hp), saddr, sport, daddr, ntohs(dport), @@ -109,7 +108,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, */ break; case NF_TPROXY_LOOKUP_ESTABLISHED: - sk = __inet6_lookup_established(net, hinfo, saddr, sport, daddr, + sk = __inet6_lookup_established(net, saddr, sport, daddr, ntohs(dport), in->ifindex, 0); break; default: diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d99717376bff..8b2e7b7afbd8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -388,8 +388,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, bool fatal; int err; - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->daddr, th->dest, + sk = __inet6_lookup_established(net, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), skb->dev->ifindex, inet6_sdif(skb)); @@ -1073,8 +1072,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - NULL, 0, &ipv6h->saddr, th->source, + sk1 = inet6_lookup_listener(net, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), dif, sdif); if (!sk1) @@ -1789,7 +1787,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) hdr = ipv6_hdr(skb); lookup: - sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), + sk = __inet6_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) @@ -1976,8 +1974,7 @@ do_time_wait: { struct sock *sk2; - sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), + sk2 = inet6_lookup_listener(net, skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), @@ -2029,8 +2026,7 @@ void tcp_v6_early_demux(struct sk_buff *skb) return; /* Note : We use inet6_iif() here, not tcp_v6_iif() */ - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->saddr, th->source, + sk = __inet6_lookup_established(net, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), inet6_iif(skb), inet6_sdif(skb)); if (sk) { diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index a8a04f441e78..effeba58630b 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -36,8 +36,7 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet6_get_iif_sdif(skb, &iif, &sdif); hdr = skb_gro_network_header(skb); net = dev_net_rcu(skb->dev); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->saddr, th->source, + sk = __inet6_lookup_established(net, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; -- cgit v1.2.3 From d2b007374551ac09db16badde575cdd698f6fc92 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 24 Aug 2025 11:43:50 +0300 Subject: devlink: Move graceful period parameter to reporter ops Move the default graceful period from a parameter to devlink_health_reporter_create() to a field in the devlink_health_reporter_ops structure. This change improves consistency, as the graceful period is inherently tied to the reporter's behavior and recovery policy. It simplifies the signature of devlink_health_reporter_create() and its internal helper functions. It also centralizes the reporter configuration at the ops structure, preparing the groundwork for a downstream patch that will introduce a devlink health reporter burst period attribute whose default value will similarly be provided by the driver via the ops structure. Signed-off-by: Shahar Shitrit Reviewed-by: Jiri Pirko Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250824084354.533182-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/main.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 2 +- drivers/net/ethernet/huawei/hinic/hinic_devlink.c | 10 +++--- drivers/net/ethernet/intel/ice/devlink/health.c | 3 +- .../ethernet/marvell/octeontx2/af/rvu_devlink.c | 32 ++++++++++++----- .../mellanox/mlx5/core/diag/reporter_vnic.c | 2 +- .../ethernet/mellanox/mlx5/core/en/reporter_rx.c | 10 +++--- .../ethernet/mellanox/mlx5/core/en/reporter_tx.c | 10 +++--- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/health.c | 41 +++++++++++++--------- drivers/net/ethernet/mellanox/mlxsw/core.c | 2 +- drivers/net/ethernet/qlogic/qed/qed_devlink.c | 9 ++--- drivers/net/netdevsim/health.c | 4 +-- include/net/devlink.h | 11 +++--- net/devlink/health.c | 28 ++++++--------- 15 files changed, 97 insertions(+), 71 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c index 9b81e1c260c2..c7a2eff57632 100644 --- a/drivers/net/ethernet/amd/pds_core/main.c +++ b/drivers/net/ethernet/amd/pds_core/main.c @@ -280,7 +280,7 @@ static int pdsc_init_pf(struct pdsc *pdsc) goto err_out_del_dev; } - hr = devl_health_reporter_create(dl, &pdsc_fw_reporter_ops, 0, pdsc); + hr = devl_health_reporter_create(dl, &pdsc_fw_reporter_ops, pdsc); if (IS_ERR(hr)) { devl_unlock(dl); dev_warn(pdsc->dev, "Failed to create fw reporter: %pe\n", hr); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 4c4581b0342e..43fb75806cd6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -220,7 +220,7 @@ __bnxt_dl_reporter_create(struct bnxt *bp, { struct devlink_health_reporter *reporter; - reporter = devlink_health_reporter_create(bp->dl, ops, 0, bp); + reporter = devlink_health_reporter_create(bp->dl, ops, bp); if (IS_ERR(reporter)) { netdev_warn(bp->dev, "Failed to create %s health reporter, rc = %ld\n", ops->name, PTR_ERR(reporter)); diff --git a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c index 03e42512a2d5..300bc267a259 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c @@ -443,8 +443,9 @@ int hinic_health_reporters_create(struct hinic_devlink_priv *priv) struct devlink *devlink = priv_to_devlink(priv); priv->hw_fault_reporter = - devlink_health_reporter_create(devlink, &hinic_hw_fault_reporter_ops, - 0, priv); + devlink_health_reporter_create(devlink, + &hinic_hw_fault_reporter_ops, + priv); if (IS_ERR(priv->hw_fault_reporter)) { dev_warn(&priv->hwdev->hwif->pdev->dev, "Failed to create hw fault reporter, err: %ld\n", PTR_ERR(priv->hw_fault_reporter)); @@ -452,8 +453,9 @@ int hinic_health_reporters_create(struct hinic_devlink_priv *priv) } priv->fw_fault_reporter = - devlink_health_reporter_create(devlink, &hinic_fw_fault_reporter_ops, - 0, priv); + devlink_health_reporter_create(devlink, + &hinic_fw_fault_reporter_ops, + priv); if (IS_ERR(priv->fw_fault_reporter)) { dev_warn(&priv->hwdev->hwif->pdev->dev, "Failed to create fw fault reporter, err: %ld\n", PTR_ERR(priv->fw_fault_reporter)); diff --git a/drivers/net/ethernet/intel/ice/devlink/health.c b/drivers/net/ethernet/intel/ice/devlink/health.c index ab519c0f28bf..8e9a8a8178d4 100644 --- a/drivers/net/ethernet/intel/ice/devlink/health.c +++ b/drivers/net/ethernet/intel/ice/devlink/health.c @@ -450,9 +450,8 @@ ice_init_devlink_rep(struct ice_pf *pf, { struct devlink *devlink = priv_to_devlink(pf); struct devlink_health_reporter *rep; - const u64 graceful_period = 0; - rep = devl_health_reporter_create(devlink, ops, graceful_period, pf); + rep = devl_health_reporter_create(devlink, ops, pf); if (IS_ERR(rep)) { struct device *dev = ice_pf_to_dev(pf); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c index 27c3a2daaaa9..3735372539bd 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c @@ -505,7 +505,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl) rvu_reporters->nix_event_ctx = nix_event_context; rvu_reporters->rvu_hw_nix_intr_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_intr_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_nix_intr_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_nix_intr_reporter)) { dev_warn(rvu->dev, "Failed to create hw_nix_intr reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_nix_intr_reporter)); @@ -513,7 +515,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_nix_gen_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_gen_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_nix_gen_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_nix_gen_reporter)) { dev_warn(rvu->dev, "Failed to create hw_nix_gen reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_nix_gen_reporter)); @@ -521,7 +525,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_nix_err_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_err_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_nix_err_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_nix_err_reporter)) { dev_warn(rvu->dev, "Failed to create hw_nix_err reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_nix_err_reporter)); @@ -529,7 +535,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_nix_ras_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_ras_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_nix_ras_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_nix_ras_reporter)) { dev_warn(rvu->dev, "Failed to create hw_nix_ras reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_nix_ras_reporter)); @@ -1051,7 +1059,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl) rvu_reporters->npa_event_ctx = npa_event_context; rvu_reporters->rvu_hw_npa_intr_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_intr_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_npa_intr_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_npa_intr_reporter)) { dev_warn(rvu->dev, "Failed to create hw_npa_intr reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_npa_intr_reporter)); @@ -1059,7 +1069,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_npa_gen_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_gen_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_npa_gen_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_npa_gen_reporter)) { dev_warn(rvu->dev, "Failed to create hw_npa_gen reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_npa_gen_reporter)); @@ -1067,7 +1079,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_npa_err_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_err_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_npa_err_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_npa_err_reporter)) { dev_warn(rvu->dev, "Failed to create hw_npa_err reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_npa_err_reporter)); @@ -1075,7 +1089,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_npa_ras_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_ras_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_npa_ras_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_npa_ras_reporter)) { dev_warn(rvu->dev, "Failed to create hw_npa_ras reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_npa_ras_reporter)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c index 32bb769f1829..73f5b62b8c7f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c @@ -135,7 +135,7 @@ void mlx5_reporter_vnic_create(struct mlx5_core_dev *dev) health->vnic_reporter = devlink_health_reporter_create(devlink, &mlx5_reporter_vnic_ops, - 0, dev); + dev); if (IS_ERR(health->vnic_reporter)) mlx5_core_warn(dev, "Failed to create vnic reporter, err = %ld\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index 16c44d628eda..1b9ea72abc5a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -651,22 +651,24 @@ void mlx5e_reporter_icosq_resume_recovery(struct mlx5e_channel *c) mutex_unlock(&c->icosq_recovery_lock); } +#define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500 + static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = { .name = "rx", .recover = mlx5e_rx_reporter_recover, .diagnose = mlx5e_rx_reporter_diagnose, .dump = mlx5e_rx_reporter_dump, + .default_graceful_period = MLX5E_REPORTER_RX_GRACEFUL_PERIOD, }; -#define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500 - void mlx5e_reporter_rx_create(struct mlx5e_priv *priv) { + struct devlink_port *port = priv->netdev->devlink_port; struct devlink_health_reporter *reporter; - reporter = devlink_port_health_reporter_create(priv->netdev->devlink_port, + reporter = devlink_port_health_reporter_create(port, &mlx5_rx_reporter_ops, - MLX5E_REPORTER_RX_GRACEFUL_PERIOD, priv); + priv); if (IS_ERR(reporter)) { netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n", PTR_ERR(reporter)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 85d5cb39b107..7a4a77f6fe6a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -539,22 +539,24 @@ void mlx5e_reporter_tx_ptpsq_unhealthy(struct mlx5e_ptpsq *ptpsq) mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); } +#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 + static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { .name = "tx", .recover = mlx5e_tx_reporter_recover, .diagnose = mlx5e_tx_reporter_diagnose, .dump = mlx5e_tx_reporter_dump, + .default_graceful_period = MLX5_REPORTER_TX_GRACEFUL_PERIOD, }; -#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 - void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) { + struct devlink_port *port = priv->netdev->devlink_port; struct devlink_health_reporter *reporter; - reporter = devlink_port_health_reporter_create(priv->netdev->devlink_port, + reporter = devlink_port_health_reporter_create(port, &mlx5_tx_reporter_ops, - MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv); + priv); if (IS_ERR(reporter)) { netdev_warn(priv->netdev, "Failed to create tx reporter, err = %ld\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 63a7a788fb0d..b231e7855bca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1447,7 +1447,7 @@ static void mlx5e_rep_vnic_reporter_create(struct mlx5e_priv *priv, reporter = devl_port_health_reporter_create(dl_port, &mlx5_rep_vnic_reporter_ops, - 0, rpriv); + rpriv); if (IS_ERR(reporter)) { mlx5_core_err(priv->mdev, "Failed to create representor vnic reporter, err = %ld\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index cf7a1edd0530..b63c5a221eb9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -669,54 +669,61 @@ static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work) } } +#define MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD 180000 +#define MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD 60000 +#define MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD 30000 +#define MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD \ + MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD + +static +const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ecpf_ops = { + .name = "fw_fatal", + .recover = mlx5_fw_fatal_reporter_recover, + .dump = mlx5_fw_fatal_reporter_dump, + .default_graceful_period = + MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD, +}; + static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_pf_ops = { .name = "fw_fatal", .recover = mlx5_fw_fatal_reporter_recover, .dump = mlx5_fw_fatal_reporter_dump, + .default_graceful_period = MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD, }; static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = { .name = "fw_fatal", .recover = mlx5_fw_fatal_reporter_recover, + .default_graceful_period = + MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD, }; -#define MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD 180000 -#define MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD 60000 -#define MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD 30000 -#define MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD - void mlx5_fw_reporters_create(struct mlx5_core_dev *dev) { const struct devlink_health_reporter_ops *fw_fatal_ops; struct mlx5_core_health *health = &dev->priv.health; const struct devlink_health_reporter_ops *fw_ops; struct devlink *devlink = priv_to_devlink(dev); - u64 grace_period; - fw_fatal_ops = &mlx5_fw_fatal_reporter_pf_ops; fw_ops = &mlx5_fw_reporter_pf_ops; if (mlx5_core_is_ecpf(dev)) { - grace_period = MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD; + fw_fatal_ops = &mlx5_fw_fatal_reporter_ecpf_ops; } else if (mlx5_core_is_pf(dev)) { - grace_period = MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD; + fw_fatal_ops = &mlx5_fw_fatal_reporter_pf_ops; } else { /* VF or SF */ - grace_period = MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD; fw_fatal_ops = &mlx5_fw_fatal_reporter_ops; fw_ops = &mlx5_fw_reporter_ops; } - health->fw_reporter = - devl_health_reporter_create(devlink, fw_ops, 0, dev); + health->fw_reporter = devl_health_reporter_create(devlink, fw_ops, dev); if (IS_ERR(health->fw_reporter)) mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n", PTR_ERR(health->fw_reporter)); - health->fw_fatal_reporter = - devl_health_reporter_create(devlink, - fw_fatal_ops, - grace_period, - dev); + health->fw_fatal_reporter = devl_health_reporter_create(devlink, + fw_fatal_ops, + dev); if (IS_ERR(health->fw_fatal_reporter)) mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n", PTR_ERR(health->fw_fatal_reporter)); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index 2bb2b77351bd..980f3223f124 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -2043,7 +2043,7 @@ static int mlxsw_core_health_init(struct mlxsw_core *mlxsw_core) return 0; fw_fatal = devl_health_reporter_create(devlink, &mlxsw_core_health_fw_fatal_ops, - 0, mlxsw_core); + mlxsw_core); if (IS_ERR(fw_fatal)) { dev_err(mlxsw_core->bus_info->dev, "Failed to create fw fatal reporter"); return PTR_ERR(fw_fatal); diff --git a/drivers/net/ethernet/qlogic/qed/qed_devlink.c b/drivers/net/ethernet/qlogic/qed/qed_devlink.c index 1adc7fbb3f2f..94c5689b5abd 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_devlink.c +++ b/drivers/net/ethernet/qlogic/qed/qed_devlink.c @@ -87,20 +87,21 @@ qed_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter, return 0; } +#define QED_REPORTER_FW_GRACEFUL_PERIOD 0 + static const struct devlink_health_reporter_ops qed_fw_fatal_reporter_ops = { .name = "fw_fatal", .recover = qed_fw_fatal_reporter_recover, .dump = qed_fw_fatal_reporter_dump, + .default_graceful_period = QED_REPORTER_FW_GRACEFUL_PERIOD, }; -#define QED_REPORTER_FW_GRACEFUL_PERIOD 0 - void qed_fw_reporters_create(struct devlink *devlink) { struct qed_devlink *dl = devlink_priv(devlink); - dl->fw_reporter = devlink_health_reporter_create(devlink, &qed_fw_fatal_reporter_ops, - QED_REPORTER_FW_GRACEFUL_PERIOD, dl); + dl->fw_reporter = devlink_health_reporter_create(devlink, + &qed_fw_fatal_reporter_ops, dl); if (IS_ERR(dl->fw_reporter)) { DP_NOTICE(dl->cdev, "Failed to create fw reporter, err = %ld\n", PTR_ERR(dl->fw_reporter)); diff --git a/drivers/net/netdevsim/health.c b/drivers/net/netdevsim/health.c index 688f05316b5e..3bd0e7a489c3 100644 --- a/drivers/net/netdevsim/health.c +++ b/drivers/net/netdevsim/health.c @@ -183,14 +183,14 @@ int nsim_dev_health_init(struct nsim_dev *nsim_dev, struct devlink *devlink) health->empty_reporter = devl_health_reporter_create(devlink, &nsim_dev_empty_reporter_ops, - 0, health); + health); if (IS_ERR(health->empty_reporter)) return PTR_ERR(health->empty_reporter); health->dummy_reporter = devl_health_reporter_create(devlink, &nsim_dev_dummy_reporter_ops, - 0, health); + health); if (IS_ERR(health->dummy_reporter)) { err = PTR_ERR(health->dummy_reporter); goto err_empty_reporter_destroy; diff --git a/include/net/devlink.h b/include/net/devlink.h index 3119d053bc4d..c7ad7a981b39 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -746,6 +746,8 @@ enum devlink_health_reporter_state { * if priv_ctx is NULL, run a full dump * @diagnose: callback to diagnose the current status * @test: callback to trigger a test event + * @default_graceful_period: default min time (in msec) + * between recovery attempts */ struct devlink_health_reporter_ops { @@ -760,6 +762,7 @@ struct devlink_health_reporter_ops { struct netlink_ext_ack *extack); int (*test)(struct devlink_health_reporter *reporter, struct netlink_ext_ack *extack); + u64 default_graceful_period; }; /** @@ -1928,22 +1931,22 @@ void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, struct devlink_health_reporter * devl_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devlink_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devl_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); void devl_health_reporter_destroy(struct devlink_health_reporter *reporter); diff --git a/net/devlink/health.c b/net/devlink/health.c index b3ce8ecbb7fb..ba144b7426fa 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -108,11 +108,11 @@ devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, static struct devlink_health_reporter * __devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; - if (WARN_ON(graceful_period && !ops->recover)) + if (WARN_ON(ops->default_graceful_period && !ops->recover)) return ERR_PTR(-EINVAL); reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); @@ -122,7 +122,7 @@ __devlink_health_reporter_create(struct devlink *devlink, reporter->priv = priv; reporter->ops = ops; reporter->devlink = devlink; - reporter->graceful_period = graceful_period; + reporter->graceful_period = ops->default_graceful_period; reporter->auto_recover = !!ops->recover; reporter->auto_dump = !!ops->dump; return reporter; @@ -134,13 +134,12 @@ __devlink_health_reporter_create(struct devlink *devlink, * * @port: devlink_port to which health reports will relate * @ops: devlink health reporter ops - * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; @@ -150,8 +149,7 @@ devl_port_health_reporter_create(struct devlink_port *port, ops->name)) return ERR_PTR(-EEXIST); - reporter = __devlink_health_reporter_create(port->devlink, ops, - graceful_period, priv); + reporter = __devlink_health_reporter_create(port->devlink, ops, priv); if (IS_ERR(reporter)) return reporter; @@ -164,14 +162,13 @@ EXPORT_SYMBOL_GPL(devl_port_health_reporter_create); struct devlink_health_reporter * devlink_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; struct devlink *devlink = port->devlink; devl_lock(devlink); - reporter = devl_port_health_reporter_create(port, ops, - graceful_period, priv); + reporter = devl_port_health_reporter_create(port, ops, priv); devl_unlock(devlink); return reporter; } @@ -182,13 +179,12 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); * * @devlink: devlink instance which the health reports will relate * @ops: devlink health reporter ops - * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; @@ -197,8 +193,7 @@ devl_health_reporter_create(struct devlink *devlink, if (devlink_health_reporter_find_by_name(devlink, ops->name)) return ERR_PTR(-EEXIST); - reporter = __devlink_health_reporter_create(devlink, ops, - graceful_period, priv); + reporter = __devlink_health_reporter_create(devlink, ops, priv); if (IS_ERR(reporter)) return reporter; @@ -210,13 +205,12 @@ EXPORT_SYMBOL_GPL(devl_health_reporter_create); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; devl_lock(devlink); - reporter = devl_health_reporter_create(devlink, ops, - graceful_period, priv); + reporter = devl_health_reporter_create(devlink, ops, priv); devl_unlock(devlink); return reporter; } -- cgit v1.2.3 From 6a06d8c40510ba1ecf27977f528b1eb74f290a60 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 24 Aug 2025 11:43:52 +0300 Subject: devlink: Introduce burst period for health reporter Currently, the devlink health reporter starts the grace period immediately after handling an error, blocking any further recoveries until it finished. However, when a single root cause triggers multiple errors in a short time frame, it is desirable to treat them as a bulk of errors and to allow their recoveries, avoiding premature blocking of subsequent related errors, and reducing the risk of inconsistent or incomplete error handling. To address this, introduce a configurable burst period for devlink health reporter. Start this period when the first error is handled, and allow recovery attempts for reported errors during this window. Once burst period expires, begin the grace period to block further recoveries until it concludes. Timeline summary: ----|--------|------------------------------/----------------------/-- error is error is burst period grace period reported recovered (recoveries allowed) (recoveries blocked) For calculating the burst period duration, use the same last_recovery_ts as the grace period. Update it on recovery only when the burst period is inactive (either disabled or at the first error). This patch implements the framework for the burst period and effectively sets its value to 0 at reporter creation, so the current behavior remains unchanged, which ensures backward compatibility. A downstream patch will make the burst period configurable. Signed-off-by: Shahar Shitrit Reviewed-by: Jiri Pirko Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250824084354.533182-4-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 3 +++ net/devlink/health.c | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index c7ad7a981b39..5f44e702c25c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -748,6 +748,8 @@ enum devlink_health_reporter_state { * @test: callback to trigger a test event * @default_graceful_period: default min time (in msec) * between recovery attempts + * @default_burst_period: default time (in msec) for + * error recoveries before starting the grace period */ struct devlink_health_reporter_ops { @@ -763,6 +765,7 @@ struct devlink_health_reporter_ops { int (*test)(struct devlink_health_reporter *reporter, struct netlink_ext_ack *extack); u64 default_graceful_period; + u64 default_burst_period; }; /** diff --git a/net/devlink/health.c b/net/devlink/health.c index 9d0d4a9face7..94ab77f77add 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -60,6 +60,7 @@ struct devlink_health_reporter { struct devlink_port *devlink_port; struct devlink_fmsg *dump_fmsg; u64 graceful_period; + u64 burst_period; bool auto_recover; bool auto_dump; u8 health_state; @@ -123,6 +124,7 @@ __devlink_health_reporter_create(struct devlink *devlink, reporter->ops = ops; reporter->devlink = devlink; reporter->graceful_period = ops->default_graceful_period; + reporter->burst_period = ops->default_burst_period; reporter->auto_recover = !!ops->recover; reporter->auto_dump = !!ops->dump; return reporter; @@ -508,11 +510,25 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter, devlink_nl_notify_send_desc(devlink, msg, &desc); } +static bool +devlink_health_reporter_in_burst(struct devlink_health_reporter *reporter) +{ + unsigned long burst_threshold = reporter->last_recovery_ts + + msecs_to_jiffies(reporter->burst_period); + + return time_is_after_jiffies(burst_threshold); +} + void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) { reporter->recovery_count++; - reporter->last_recovery_ts = jiffies; + if (!devlink_health_reporter_in_burst(reporter)) + /* When burst period is set, last_recovery_ts marks the first + * recovery within the burst period, not necessarily the last + * one. + */ + reporter->last_recovery_ts = jiffies; } EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); @@ -599,7 +615,11 @@ devlink_health_recover_abort(struct devlink_health_reporter *reporter, if (prev_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) return true; + if (devlink_health_reporter_in_burst(reporter)) + return false; + recover_ts_threshold = reporter->last_recovery_ts + + msecs_to_jiffies(reporter->burst_period) + msecs_to_jiffies(reporter->graceful_period); if (reporter->last_recovery_ts && reporter->recovery_count && time_is_after_jiffies(recover_ts_threshold)) -- cgit v1.2.3 From 1bec9d0c0046fe4e2bfb6a1c5aadcb5d56cdb0fb Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 25 Aug 2025 15:37:43 +0200 Subject: ipv4: Convert ->flowi4_tos to dscp_t. Convert the ->flowic_tos field of struct flowi_common from __u8 to dscp_t, rename it ->flowic_dscp and propagate these changes to struct flowi and struct flowi4. We've had several bugs in the past where ECN bits could interfere with IPv4 routing, because these bits were not properly cleared when setting ->flowi4_tos. These bugs should be fixed now and the dscp_t type has been introduced to ensure that variables carrying DSCP values don't accidentally have any ECN bits set. Several variables and structure fields have been converted to dscp_t already, but the main IPv4 routing structure, struct flowi4, is still using a __u8. To avoid any future regression, this patch converts it to dscp_t. There are many users to convert at once. Fortunately, around half of ->flowi4_tos users already have a dscp_t value at hand, which they currently convert to __u8 using inet_dscp_to_dsfield(). For all of these users, we just need to drop that conversion. But, although we try to do the __u8 <-> dscp_t conversions at the boundaries of the network or of user space, some places still store TOS/DSCP variables as __u8 in core networking code. Those can hardly be converted either because the data structure is part of UAPI or because the same variable or field is also used for handling ECN in other parts of the code. In all of these cases where we don't have a dscp_t variable at hand, we need to use inet_dsfield_to_dscp() when interacting with ->flowi4_dscp. Changes since v1: * Fix space alignment in __bpf_redirect_neigh_v4() (Ido). Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/29acecb45e911d17446b9a3dbdb1ab7b821ea371.1756128932.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 6 ++++-- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 7 ++++--- drivers/net/ethernet/sfc/tc_encap_actions.c | 4 +++- drivers/net/gtp.c | 7 ++++--- drivers/net/ipvlan/ipvlan_core.c | 4 ++-- drivers/net/vrf.c | 4 ++-- include/net/flow.h | 11 ++++++----- include/net/inet_dscp.h | 6 ++++++ include/net/ip_fib.h | 2 +- include/net/ip_tunnels.h | 4 +++- include/net/route.h | 2 +- include/trace/events/fib.h | 4 +++- net/core/filter.c | 4 ++-- net/core/lwt_bpf.c | 4 ++-- net/ipv4/fib_frontend.c | 7 ++++--- net/ipv4/fib_rules.c | 4 ++-- net/ipv4/icmp.c | 5 +++-- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ip_output.c | 3 ++- net/ipv4/ipmr.c | 3 ++- net/ipv4/netfilter.c | 4 ++-- net/ipv4/netfilter/ipt_rpfilter.c | 4 ++-- net/ipv4/netfilter/nf_dup_ipv4.c | 4 ++-- net/ipv4/netfilter/nft_fib_ipv4.c | 4 ++-- net/ipv4/route.c | 8 ++++---- net/ipv4/udp_tunnel_core.c | 3 ++- net/ipv4/xfrm4_policy.c | 4 ++-- net/netfilter/nft_flow_offload.c | 4 ++-- net/sctp/protocol.c | 3 ++- net/xfrm/xfrm_policy.c | 6 +++--- 30 files changed, 81 insertions(+), 58 deletions(-) (limited to 'include/net') diff --git a/drivers/net/amt.c b/drivers/net/amt.c index ed86537b2f61..902c817a0dea 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include static struct workqueue_struct *amt_wq; @@ -1018,7 +1020,7 @@ static bool amt_send_membership_update(struct amt_dev *amt, fl4.flowi4_oif = amt->stream_dev->ifindex; fl4.daddr = amt->remote_ip; fl4.saddr = amt->local_ip; - fl4.flowi4_tos = AMT_TOS; + fl4.flowi4_dscp = inet_dsfield_to_dscp(AMT_TOS); fl4.flowi4_proto = IPPROTO_UDP; rt = ip_route_output_key(amt->net, &fl4); if (IS_ERR(rt)) { @@ -1133,7 +1135,7 @@ static bool amt_send_membership_query(struct amt_dev *amt, fl4.flowi4_oif = amt->stream_dev->ifindex; fl4.daddr = tunnel->ip4; fl4.saddr = amt->local_ip; - fl4.flowi4_tos = AMT_TOS; + fl4.flowi4_dscp = inet_dsfield_to_dscp(AMT_TOS); fl4.flowi4_proto = IPPROTO_UDP; rt = ip_route_output_key(amt->net, &fl4); if (IS_ERR(rt)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 2162d776fe35..a14f216048cd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -1,7 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* Copyright (c) 2018 Mellanox Technologies. */ -#include +#include +#include #include #include #include @@ -233,7 +234,7 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, int err; /* add the IP fields */ - attr.fl.fl4.flowi4_tos = tun_key->tos & ~INET_ECN_MASK; + attr.fl.fl4.flowi4_dscp = inet_dsfield_to_dscp(tun_key->tos); attr.fl.fl4.daddr = tun_key->u.ipv4.dst; attr.fl.fl4.saddr = tun_key->u.ipv4.src; attr.ttl = tun_key->ttl; @@ -349,7 +350,7 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv, int err; /* add the IP fields */ - attr.fl.fl4.flowi4_tos = tun_key->tos & ~INET_ECN_MASK; + attr.fl.fl4.flowi4_dscp = inet_dsfield_to_dscp(tun_key->tos); attr.fl.fl4.daddr = tun_key->u.ipv4.dst; attr.fl.fl4.saddr = tun_key->u.ipv4.src; attr.ttl = tun_key->ttl; diff --git a/drivers/net/ethernet/sfc/tc_encap_actions.c b/drivers/net/ethernet/sfc/tc_encap_actions.c index e872f926e438..eef06e48185d 100644 --- a/drivers/net/ethernet/sfc/tc_encap_actions.c +++ b/drivers/net/ethernet/sfc/tc_encap_actions.c @@ -11,6 +11,8 @@ #include "tc_encap_actions.h" #include "tc.h" #include "mae.h" +#include +#include #include #include #include @@ -99,7 +101,7 @@ static int efx_bind_neigh(struct efx_nic *efx, case EFX_ENCAP_TYPE_GENEVE: flow4.flowi4_proto = IPPROTO_UDP; flow4.fl4_dport = encap->key.tp_dst; - flow4.flowi4_tos = encap->key.tos; + flow4.flowi4_dscp = inet_dsfield_to_dscp(encap->key.tos); flow4.daddr = encap->key.u.ipv4.dst; flow4.saddr = encap->key.u.ipv4.src; break; diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 4b668ebaa0f7..5cb59d72bc82 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -21,9 +21,10 @@ #include #include +#include +#include #include #include -#include #include #include #include @@ -352,7 +353,7 @@ static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4, fl4->flowi4_oif = sk->sk_bound_dev_if; fl4->daddr = daddr; fl4->saddr = saddr; - fl4->flowi4_tos = inet_dscp_to_dsfield(inet_sk_dscp(inet_sk(sk))); + fl4->flowi4_dscp = inet_sk_dscp(inet_sk(sk)); fl4->flowi4_scope = ip_sock_rt_scope(sk); fl4->flowi4_proto = sk->sk_protocol; @@ -2401,7 +2402,7 @@ static int gtp_genl_send_echo_req(struct sk_buff *skb, struct genl_info *info) udp_tunnel_xmit_skb(rt, sk, skb_to_send, fl4.saddr, fl4.daddr, - fl4.flowi4_tos, + inet_dscp_to_dsfield(fl4.flowi4_dscp), ip4_dst_hoplimit(&rt->dst), 0, port, port, diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index e3e65772c599..d7e3ddbcab6f 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -2,7 +2,7 @@ /* Copyright (c) 2014 Mahesh Bandewar */ -#include +#include #include #include "ipvlan.h" @@ -433,7 +433,7 @@ static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) ip4h = ip_hdr(skb); fl4.daddr = ip4h->daddr; fl4.saddr = ip4h->saddr; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)); + fl4.flowi4_dscp = ip4h_dscp(ip4h); rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 3ccd649913b5..571847a7f86d 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -38,7 +39,6 @@ #include #include #include -#include #define DRV_NAME "vrf" #define DRV_VERSION "1.1" @@ -505,7 +505,7 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, /* needed to match OIF rule */ fl4.flowi4_l3mdev = vrf_dev->ifindex; fl4.flowi4_iif = LOOPBACK_IFINDEX; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)); + fl4.flowi4_dscp = ip4h_dscp(ip4h); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = ip4h->protocol; fl4.daddr = ip4h->daddr; diff --git a/include/net/flow.h b/include/net/flow.h index a1839c278d87..ae9481c40063 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -12,6 +12,7 @@ #include #include #include +#include struct flow_keys; @@ -32,7 +33,7 @@ struct flowi_common { int flowic_iif; int flowic_l3mdev; __u32 flowic_mark; - __u8 flowic_tos; + dscp_t flowic_dscp; __u8 flowic_scope; __u8 flowic_proto; __u8 flowic_flags; @@ -70,7 +71,7 @@ struct flowi4 { #define flowi4_iif __fl_common.flowic_iif #define flowi4_l3mdev __fl_common.flowic_l3mdev #define flowi4_mark __fl_common.flowic_mark -#define flowi4_tos __fl_common.flowic_tos +#define flowi4_dscp __fl_common.flowic_dscp #define flowi4_scope __fl_common.flowic_scope #define flowi4_proto __fl_common.flowic_proto #define flowi4_flags __fl_common.flowic_flags @@ -103,7 +104,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_l3mdev = 0; fl4->flowi4_mark = mark; - fl4->flowi4_tos = tos; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_scope = scope; fl4->flowi4_proto = proto; fl4->flowi4_flags = flags; @@ -141,7 +142,7 @@ struct flowi6 { #define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; - /* Note: flowi6_tos is encoded in flowlabel, too. */ + /* Note: flowi6_dscp is encoded in flowlabel, too. */ __be32 flowlabel; union flowi_uli uli; #define fl6_sport uli.ports.sport @@ -163,7 +164,7 @@ struct flowi { #define flowi_iif u.__fl_common.flowic_iif #define flowi_l3mdev u.__fl_common.flowic_l3mdev #define flowi_mark u.__fl_common.flowic_mark -#define flowi_tos u.__fl_common.flowic_tos +#define flowi_dscp u.__fl_common.flowic_dscp #define flowi_scope u.__fl_common.flowic_scope #define flowi_proto u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags diff --git a/include/net/inet_dscp.h b/include/net/inet_dscp.h index 72f250dffada..1aa9f04ed1ab 100644 --- a/include/net/inet_dscp.h +++ b/include/net/inet_dscp.h @@ -39,6 +39,12 @@ typedef u8 __bitwise dscp_t; #define INET_DSCP_MASK 0xfc +/* A few places in the IPv4 code need to ignore the three high order bits of + * DSCP because of backward compatibility (as these bits used to represent the + * IPv4 Precedence in RFC 791's TOS field and were ignored). + */ +#define INET_DSCP_LEGACY_TOS_MASK ((__force dscp_t)0x1c) + static inline dscp_t inet_dsfield_to_dscp(__u8 dsfield) { return (__force dscp_t)(dsfield & INET_DSCP_MASK); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 48bb3cf41469..b4495c38e0a0 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -440,7 +440,7 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net, static inline bool fib_dscp_masked_match(dscp_t dscp, const struct flowi4 *fl4) { - return dscp == inet_dsfield_to_dscp(RT_TOS(fl4->flowi4_tos)); + return dscp == (fl4->flowi4_dscp & INET_DSCP_LEGACY_TOS_MASK); } /* Exported by fib_frontend.c */ diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 8cf1380f3656..4314a97702ea 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -11,7 +11,9 @@ #include #include +#include #include +#include #include #include #include @@ -362,7 +364,7 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4, fl4->daddr = daddr; fl4->saddr = saddr; - fl4->flowi4_tos = tos; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; diff --git a/include/net/route.h b/include/net/route.h index 7ea840daa775..c71998f464f8 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -189,7 +189,7 @@ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, { struct flowi4 fl4 = { .flowi4_oif = oif, - .flowi4_tos = inet_dscp_to_dsfield(dscp), + .flowi4_dscp = dscp, .flowi4_scope = scope, .daddr = daddr, .saddr = saddr, diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h index 20b914250ce9..feb28b359eff 100644 --- a/include/trace/events/fib.h +++ b/include/trace/events/fib.h @@ -7,6 +7,8 @@ #include #include +#include +#include #include #include @@ -44,7 +46,7 @@ TRACE_EVENT(fib_table_lookup, __entry->err = err; __entry->oif = flp->flowi4_oif; __entry->iif = flp->flowi4_iif; - __entry->tos = flp->flowi4_tos; + __entry->tos = inet_dscp_to_dsfield(flp->flowi4_dscp); __entry->scope = flp->flowi4_scope; __entry->flags = flp->flowi4_flags; diff --git a/net/core/filter.c b/net/core/filter.c index 5da1cad66be2..b005363f482c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2373,7 +2373,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), + .flowi4_dscp = ip4h_dscp(ip4h), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, @@ -6020,7 +6020,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } - fl4.flowi4_tos = params->tos & INET_DSCP_MASK; + fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index ae74634310a3..9f40be0c3e71 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -8,12 +8,12 @@ #include #include #include +#include #include #include #include #include #include -#include struct bpf_lwt_prog { struct bpf_prog *prog; @@ -209,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl4.flowi4_oif = oif; fl4.flowi4_mark = skb->mark; fl4.flowi4_uid = sock_net_uid(net, sk); - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = iph->protocol; fl4.daddr = iph->daddr; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6e1b94796f67..1dab44e13d3b 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -293,7 +294,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))), + .flowi4_dscp = ip4h_dscp(ip_hdr(skb)), .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; @@ -358,7 +359,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; @@ -1372,7 +1373,7 @@ static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, - .flowi4_tos = frn->fl_tos & INET_DSCP_MASK, + .flowi4_dscp = inet_dsfield_to_dscp(frn->fl_tos), .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index fa58d6620ed6..51f0193092f0 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -193,8 +194,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, * to mask the upper three DSCP bits prior to matching to maintain * legacy behavior. */ - if (r->dscp_full && - (r->dscp ^ inet_dsfield_to_dscp(fl4->flowi4_tos)) & r->dscp_mask) + if (r->dscp_full && (r->dscp ^ fl4->flowi4_dscp) & r->dscp_mask) return 0; else if (!r->dscp_full && r->dscp && !fib_dscp_masked_match(r->dscp, fl4)) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 91765057aa1d..7248c15cbd75 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -444,7 +445,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.saddr = saddr; fl4.flowi4_mark = mark; fl4.flowi4_uid = sock_net_uid(net, NULL); - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))); + fl4.flowi4_dscp = ip4h_dscp(ip_hdr(skb)); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); @@ -495,7 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); - fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4->flowi4_dscp = dscp; fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f5b9004d6938..761a53c6a89a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -44,7 +45,6 @@ #include #include #include -#include /* Problems & solutions @@ -930,7 +930,7 @@ static int ipgre_open(struct net_device *dev) if (ipv4_is_multicast(t->parms.iph.daddr)) { struct flowi4 fl4 = { .flowi4_oif = t->parms.link, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(&t->parms.iph)), + .flowi4_dscp = ip4h_dscp(&t->parms.iph), .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_proto = IPPROTO_GRE, .saddr = t->parms.iph.saddr, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 84e7f8a2f50f..2b96651d719b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -63,6 +63,7 @@ #include #include +#include #include #include #include @@ -485,7 +486,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, inet_sk_init_flowi4(inet, fl4); /* sctp_v4_xmit() uses its own DSCP value */ - fl4->flowi4_tos = tos & INET_DSCP_MASK; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e86a8a862c41..345e5faac634 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -2120,7 +2121,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), + .flowi4_dscp = ip4h_dscp(iph), .flowi4_oif = (rt_is_output_route(rt) ? skb->dev->ifindex : 0), .flowi4_iif = (rt_is_output_route(rt) ? diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index e60e54e7945d..ce310eb779e0 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -11,10 +11,10 @@ #include #include #include +#include #include #include #include -#include #include /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ @@ -44,7 +44,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un */ fl4.daddr = iph->daddr; fl4.saddr = saddr; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index a27782d7653e..6d9bf5106868 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; - flow.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + flow.flowi4_dscp = ip4h_dscp(iph); flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par)); flow.flowi4_uid = sock_net_uid(xt_net(par), NULL); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index ed08fb78cfa8..9a773502f10a 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -12,10 +12,10 @@ #include #include #include +#include #include #include #include -#include #include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include @@ -33,7 +33,7 @@ static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb, fl4.flowi4_oif = oif; fl4.daddr = gw->s_addr; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; rt = ip_route_output_key(net, &fl4); diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 7e7c49535e3f..82af6cd76d13 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include @@ -114,7 +114,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (priv->flags & NFTA_FIB_F_MARK) fl4.flowi4_mark = pkt->skb->mark; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); if (priv->flags & NFTA_FIB_F_DADDR) { fl4.daddr = iph->daddr; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1f212b2ce4c6..771f6986ed05 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -84,6 +84,7 @@ #include #include #include +#include #include #include #include @@ -1291,7 +1292,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), + .flowi4_dscp = ip4h_dscp(iph), .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, @@ -2331,7 +2332,7 @@ ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.daddr = daddr; @@ -2690,7 +2691,6 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; - fl4->flowi4_tos &= INET_DSCP_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); @@ -3333,7 +3333,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.daddr = dst; fl4.saddr = src; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0); fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index fce945f23069..54386e06a813 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -253,7 +254,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, fl4.saddr = key->u.ipv4.src; fl4.fl4_dport = dport; fl4.fl4_sport = sport; - fl4.flowi4_tos = tos & INET_DSCP_MASK; + fl4.flowi4_dscp = inet_dsfield_to_dscp(tos); fl4.flowi4_flags = key->flow_flags; rt = ip_route_output_key(net, &fl4); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7fb6205619e7..58faf1ddd2b1 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -25,7 +25,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = params->daddr->a4; - fl4->flowi4_tos = inet_dscp_to_dsfield(params->dscp); + fl4->flowi4_dscp = params->dscp; fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl4->flowi4_mark = params->mark; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 225ff293cd50..14dd1c0698c3 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -236,7 +236,7 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; - fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb))); + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); fl.u.ip4.flowi4_mark = pkt->skb->mark; fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; break; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 3b2373b3bd5d..9dbc24af749b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -437,7 +438,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4->flowi4_dscp = dscp; fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7111184eef59..62486f866975 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2594,7 +2594,7 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, static dscp_t xfrm_get_dscp(const struct flowi *fl, int family) { if (family == AF_INET) - return inet_dsfield_to_dscp(fl->u.ip4.flowi4_tos); + return fl->u.ip4.flowi4_dscp; return 0; } @@ -3462,7 +3462,7 @@ decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reve } fl4->flowi4_proto = flkeys->basic.ip_proto; - fl4->flowi4_tos = flkeys->ip.tos & ~INET_ECN_MASK; + fl4->flowi4_dscp = inet_dsfield_to_dscp(flkeys->ip.tos); } #if IS_ENABLED(CONFIG_IPV6) @@ -3594,7 +3594,7 @@ static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family, fl1->flowi_oif = fl->flowi_oif; fl1->flowi_mark = fl->flowi_mark; - fl1->flowi_tos = fl->flowi_tos; + fl1->flowi_dscp = fl->flowi_dscp; nf_nat_decode_session(newskb, fl1, family); ret = false; -- cgit v1.2.3 From 095928e7d80186c524013a5b5d54889fa2ec1eaa Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 23 Aug 2025 21:36:43 -0400 Subject: ipv6: sr: Use HMAC-SHA1 and HMAC-SHA256 library functions Use the HMAC-SHA1 and HMAC-SHA256 library functions instead of crypto_shash. This is simpler and faster. Pre-allocating per-CPU hash transformation objects and descriptors is no longer needed, and a microbenchmark on x86_64 shows seg6_hmac_compute() (with HMAC-SHA256) dropping from ~2494 cycles to ~1978 cycles, a 20% improvement. Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250824013644.71928-2-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/net/seg6_hmac.h | 12 --- net/ipv6/Kconfig | 7 +- net/ipv6/seg6.c | 7 -- net/ipv6/seg6_hmac.c | 207 +++++++----------------------------------------- 4 files changed, 30 insertions(+), 203 deletions(-) (limited to 'include/net') diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h index 24f733b3e3fe..3fe4123dbbf0 100644 --- a/include/net/seg6_hmac.h +++ b/include/net/seg6_hmac.h @@ -19,7 +19,6 @@ #include #include -#define SEG6_HMAC_MAX_DIGESTSIZE 160 #define SEG6_HMAC_RING_SIZE 256 struct seg6_hmac_info { @@ -32,13 +31,6 @@ struct seg6_hmac_info { u8 alg_id; }; -struct seg6_hmac_algo { - u8 alg_id; - char name[64]; - struct crypto_shash * __percpu *tfms; - struct shash_desc * __percpu *shashs; -}; - extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, u8 *output); @@ -50,13 +42,9 @@ extern int seg6_push_hmac(struct net *net, struct in6_addr *saddr, struct ipv6_sr_hdr *srh); extern bool seg6_hmac_validate_skb(struct sk_buff *skb); #ifdef CONFIG_IPV6_SEG6_HMAC -extern int seg6_hmac_init(void); -extern void seg6_hmac_exit(void); extern int seg6_hmac_net_init(struct net *net); extern void seg6_hmac_net_exit(struct net *net); #else -static inline int seg6_hmac_init(void) { return 0; } -static inline void seg6_hmac_exit(void) {} static inline int seg6_hmac_net_init(struct net *net) { return 0; } static inline void seg6_hmac_net_exit(struct net *net) {} #endif diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 1c9c686d9522..b8f9a8c0302e 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -304,10 +304,9 @@ config IPV6_SEG6_LWTUNNEL config IPV6_SEG6_HMAC bool "IPv6: Segment Routing HMAC support" depends on IPV6 - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_UTILS help Support for HMAC signature generation and verification of SR-enabled packets. diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 180da19c148c..a5c4c629b788 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -522,16 +522,10 @@ int __init seg6_init(void) if (err) goto out_unregister_iptun; - err = seg6_hmac_init(); - if (err) - goto out_unregister_seg6; - pr_info("Segment Routing with IPv6\n"); out: return err; -out_unregister_seg6: - seg6_local_exit(); out_unregister_iptun: seg6_iptunnel_exit(); out_unregister_genl: @@ -543,7 +537,6 @@ out_unregister_pernet: void seg6_exit(void) { - seg6_hmac_exit(); seg6_local_exit(); seg6_iptunnel_exit(); genl_unregister_family(&seg6_genl_family); diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index fd58426f222b..61f6019df55b 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -34,7 +33,8 @@ #include #include -#include +#include +#include #include #include #include @@ -78,17 +78,6 @@ static const struct rhashtable_params rht_params = { .obj_cmpfn = seg6_hmac_cmpfn, }; -static struct seg6_hmac_algo hmac_algos[] = { - { - .alg_id = SEG6_HMAC_ALGO_SHA1, - .name = "hmac(sha1)", - }, - { - .alg_id = SEG6_HMAC_ALGO_SHA256, - .name = "hmac(sha256)", - }, -}; - static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) { struct sr6_tlv_hmac *tlv; @@ -108,75 +97,13 @@ static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) return tlv; } -static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id) -{ - struct seg6_hmac_algo *algo; - int i, alg_count; - - alg_count = ARRAY_SIZE(hmac_algos); - for (i = 0; i < alg_count; i++) { - algo = &hmac_algos[i]; - if (algo->alg_id == alg_id) - return algo; - } - - return NULL; -} - -static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize, - u8 *output, int outlen) -{ - struct seg6_hmac_algo *algo; - struct crypto_shash *tfm; - struct shash_desc *shash; - int ret, dgsize; - - algo = __hmac_get_algo(hinfo->alg_id); - if (!algo) - return -ENOENT; - - tfm = *this_cpu_ptr(algo->tfms); - - dgsize = crypto_shash_digestsize(tfm); - if (dgsize > outlen) { - pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n", - dgsize, outlen); - return -ENOMEM; - } - - ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen); - if (ret < 0) { - pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret); - goto failed; - } - - shash = *this_cpu_ptr(algo->shashs); - shash->tfm = tfm; - - ret = crypto_shash_digest(shash, text, psize, output); - if (ret < 0) { - pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret); - goto failed; - } - - return dgsize; - -failed: - return ret; -} - int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, u8 *output) { __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid); - u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE]; - int plen, i, dgsize, wrsize; + int plen, i, ret = 0; char *ring, *off; - /* a 160-byte buffer for digest output allows to store highest known - * hash function (RadioGatun) with up to 1216 bits - */ - /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */ plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16; @@ -219,22 +146,26 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, off += 16; } - dgsize = __do_hmac(hinfo, ring, plen, tmp_out, - SEG6_HMAC_MAX_DIGESTSIZE); + switch (hinfo->alg_id) { + case SEG6_HMAC_ALGO_SHA1: + hmac_sha1_usingrawkey(hinfo->secret, hinfo->slen, ring, plen, + output); + static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE); + memset(&output[SHA1_DIGEST_SIZE], 0, + SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE); + break; + case SEG6_HMAC_ALGO_SHA256: + hmac_sha256_usingrawkey(hinfo->secret, hinfo->slen, ring, plen, + output); + static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE); + break; + default: + ret = -ENOENT; + break; + } local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); - - if (dgsize < 0) - return dgsize; - - wrsize = SEG6_HMAC_FIELD_LEN; - if (wrsize > dgsize) - wrsize = dgsize; - - memset(output, 0, SEG6_HMAC_FIELD_LEN); - memcpy(output, tmp_out, wrsize); - - return 0; + return ret; } EXPORT_SYMBOL(seg6_hmac_compute); @@ -305,8 +236,13 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) struct seg6_pernet_data *sdata = seg6_pernet(net); int err; - if (!__hmac_get_algo(hinfo->alg_id)) + switch (hinfo->alg_id) { + case SEG6_HMAC_ALGO_SHA1: + case SEG6_HMAC_ALGO_SHA256: + break; + default: return -EINVAL; + } err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node, rht_params); @@ -363,65 +299,6 @@ out: } EXPORT_SYMBOL(seg6_push_hmac); -static int seg6_hmac_init_algo(void) -{ - struct seg6_hmac_algo *algo; - struct crypto_shash *tfm; - struct shash_desc *shash; - int i, alg_count, cpu; - int ret = -ENOMEM; - - alg_count = ARRAY_SIZE(hmac_algos); - - for (i = 0; i < alg_count; i++) { - struct crypto_shash **p_tfm; - int shsize; - - algo = &hmac_algos[i]; - algo->tfms = alloc_percpu(struct crypto_shash *); - if (!algo->tfms) - goto error_out; - - for_each_possible_cpu(cpu) { - tfm = crypto_alloc_shash(algo->name, 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto error_out; - } - p_tfm = per_cpu_ptr(algo->tfms, cpu); - *p_tfm = tfm; - } - - p_tfm = raw_cpu_ptr(algo->tfms); - tfm = *p_tfm; - - shsize = sizeof(*shash) + crypto_shash_descsize(tfm); - - algo->shashs = alloc_percpu(struct shash_desc *); - if (!algo->shashs) - goto error_out; - - for_each_possible_cpu(cpu) { - shash = kzalloc_node(shsize, GFP_KERNEL, - cpu_to_node(cpu)); - if (!shash) - goto error_out; - *per_cpu_ptr(algo->shashs, cpu) = shash; - } - } - - return 0; - -error_out: - seg6_hmac_exit(); - return ret; -} - -int __init seg6_hmac_init(void) -{ - return seg6_hmac_init_algo(); -} - int __net_init seg6_hmac_net_init(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); @@ -429,36 +306,6 @@ int __net_init seg6_hmac_net_init(struct net *net) return rhashtable_init(&sdata->hmac_infos, &rht_params); } -void seg6_hmac_exit(void) -{ - struct seg6_hmac_algo *algo = NULL; - struct crypto_shash *tfm; - struct shash_desc *shash; - int i, alg_count, cpu; - - alg_count = ARRAY_SIZE(hmac_algos); - for (i = 0; i < alg_count; i++) { - algo = &hmac_algos[i]; - - if (algo->shashs) { - for_each_possible_cpu(cpu) { - shash = *per_cpu_ptr(algo->shashs, cpu); - kfree(shash); - } - free_percpu(algo->shashs); - } - - if (algo->tfms) { - for_each_possible_cpu(cpu) { - tfm = *per_cpu_ptr(algo->tfms, cpu); - crypto_free_shash(tfm); - } - free_percpu(algo->tfms); - } - } -} -EXPORT_SYMBOL(seg6_hmac_exit); - void __net_exit seg6_hmac_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); -- cgit v1.2.3 From fe60065689048edf4df99fffdb180a2166f9a54d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 23 Aug 2025 21:36:44 -0400 Subject: ipv6: sr: Prepare HMAC key ahead of time Prepare the HMAC key when it is added to the kernel, instead of preparing it implicitly for every packet. This significantly improves the performance of seg6_hmac_compute(). A microbenchmark on x86_64 shows seg6_hmac_compute() (with HMAC-SHA256) dropping from ~1978 cycles to ~1419 cycles, a 28% improvement. The size of 'struct seg6_hmac_info' increases by 128 bytes, but that should be fine, since there should not be a massive number of keys. Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250824013644.71928-3-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/net/seg6_hmac.h | 8 ++++++++ net/ipv6/seg6_hmac.c | 14 +++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h index 3fe4123dbbf0..e9f41725933e 100644 --- a/include/net/seg6_hmac.h +++ b/include/net/seg6_hmac.h @@ -9,6 +9,8 @@ #ifndef _NET_SEG6_HMAC_H #define _NET_SEG6_HMAC_H +#include +#include #include #include #include @@ -26,9 +28,15 @@ struct seg6_hmac_info { struct rcu_head rcu; u32 hmackeyid; + /* The raw key, kept only so it can be returned back to userspace */ char secret[SEG6_HMAC_SECRET_LEN]; u8 slen; u8 alg_id; + /* The prepared key, which the calculations actually use */ + union { + struct hmac_sha1_key sha1; + struct hmac_sha256_key sha256; + } key; }; extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo, diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 61f6019df55b..ee6bac0160ac 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -148,19 +148,18 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, switch (hinfo->alg_id) { case SEG6_HMAC_ALGO_SHA1: - hmac_sha1_usingrawkey(hinfo->secret, hinfo->slen, ring, plen, - output); + hmac_sha1(&hinfo->key.sha1, ring, plen, output); static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE); memset(&output[SHA1_DIGEST_SIZE], 0, SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE); break; case SEG6_HMAC_ALGO_SHA256: - hmac_sha256_usingrawkey(hinfo->secret, hinfo->slen, ring, plen, - output); + hmac_sha256(&hinfo->key.sha256, ring, plen, output); static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE); break; default: - ret = -ENOENT; + WARN_ON_ONCE(1); + ret = -EINVAL; break; } local_unlock_nested_bh(&hmac_storage.bh_lock); @@ -238,7 +237,12 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) switch (hinfo->alg_id) { case SEG6_HMAC_ALGO_SHA1: + hmac_sha1_preparekey(&hinfo->key.sha1, + hinfo->secret, hinfo->slen); + break; case SEG6_HMAC_ALGO_SHA256: + hmac_sha256_preparekey(&hinfo->key.sha256, + hinfo->secret, hinfo->slen); break; default: return -EINVAL; -- cgit v1.2.3 From dcb34659028f856c423a29ef9b4e2571d203444d Mon Sep 17 00:00:00 2001 From: Takamitsu Iwai Date: Sat, 23 Aug 2025 17:58:55 +0900 Subject: net: rose: split remove and free operations in rose_remove_neigh() The current rose_remove_neigh() performs two distinct operations: 1. Removes rose_neigh from rose_neigh_list 2. Frees the rose_neigh structure Split these operations into separate functions to improve maintainability and prepare for upcoming refcount_t conversion. The timer cleanup remains in rose_remove_neigh() because free operations can be called from timer itself. This patch introduce rose_neigh_put() to handle the freeing of rose_neigh structures and modify rose_remove_neigh() to handle removal only. Signed-off-by: Takamitsu Iwai Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250823085857.47674-2-takamitz@amazon.co.jp Signed-off-by: Jakub Kicinski --- include/net/rose.h | 8 ++++++++ net/rose/rose_route.c | 15 ++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/rose.h b/include/net/rose.h index 23267b4efcfa..174b4f605d84 100644 --- a/include/net/rose.h +++ b/include/net/rose.h @@ -151,6 +151,14 @@ struct rose_sock { #define rose_sk(sk) ((struct rose_sock *)(sk)) +static inline void rose_neigh_put(struct rose_neigh *rose_neigh) +{ + if (rose_neigh->ax25) + ax25_cb_put(rose_neigh->ax25); + kfree(rose_neigh->digipeat); + kfree(rose_neigh); +} + /* af_rose.c */ extern ax25_address rose_callsign; extern int sysctl_rose_restart_request_timeout; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index b72bf8a08d48..0c44c416f485 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -234,20 +234,12 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) if ((s = rose_neigh_list) == rose_neigh) { rose_neigh_list = rose_neigh->next; - if (rose_neigh->ax25) - ax25_cb_put(rose_neigh->ax25); - kfree(rose_neigh->digipeat); - kfree(rose_neigh); return; } while (s != NULL && s->next != NULL) { if (s->next == rose_neigh) { s->next = rose_neigh->next; - if (rose_neigh->ax25) - ax25_cb_put(rose_neigh->ax25); - kfree(rose_neigh->digipeat); - kfree(rose_neigh); return; } @@ -331,8 +323,10 @@ static int rose_del_node(struct rose_route_struct *rose_route, if (rose_node->neighbour[i] == rose_neigh) { rose_neigh->count--; - if (rose_neigh->count == 0 && rose_neigh->use == 0) + if (rose_neigh->count == 0 && rose_neigh->use == 0) { rose_remove_neigh(rose_neigh); + rose_neigh_put(rose_neigh); + } rose_node->count--; @@ -513,6 +507,7 @@ void rose_rt_device_down(struct net_device *dev) } rose_remove_neigh(s); + rose_neigh_put(s); } spin_unlock_bh(&rose_neigh_list_lock); spin_unlock_bh(&rose_node_list_lock); @@ -569,6 +564,7 @@ static int rose_clear_routes(void) if (s->use == 0 && !s->loopback) { s->count = 0; rose_remove_neigh(s); + rose_neigh_put(s); } } @@ -1301,6 +1297,7 @@ void __exit rose_rt_free(void) rose_neigh = rose_neigh->next; rose_remove_neigh(s); + rose_neigh_put(s); } while (rose_node != NULL) { -- cgit v1.2.3 From d860d1faa6b2ce3becfdb8b0c2b048ad31800061 Mon Sep 17 00:00:00 2001 From: Takamitsu Iwai Date: Sat, 23 Aug 2025 17:58:56 +0900 Subject: net: rose: convert 'use' field to refcount_t The 'use' field in struct rose_neigh is used as a reference counter but lacks atomicity. This can lead to race conditions where a rose_neigh structure is freed while still being referenced by other code paths. For example, when rose_neigh->use becomes zero during an ioctl operation via rose_rt_ioctl(), the structure may be removed while its timer is still active, potentially causing use-after-free issues. This patch changes the type of 'use' from unsigned short to refcount_t and updates all code paths to use rose_neigh_hold() and rose_neigh_put() which operate reference counts atomically. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Takamitsu Iwai Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250823085857.47674-3-takamitz@amazon.co.jp Signed-off-by: Jakub Kicinski --- include/net/rose.h | 18 +++++++++++++----- net/rose/af_rose.c | 13 +++++++------ net/rose/rose_in.c | 12 ++++++------ net/rose/rose_route.c | 33 ++++++++++++++++++--------------- net/rose/rose_timer.c | 2 +- 5 files changed, 45 insertions(+), 33 deletions(-) (limited to 'include/net') diff --git a/include/net/rose.h b/include/net/rose.h index 174b4f605d84..2b5491bbf39a 100644 --- a/include/net/rose.h +++ b/include/net/rose.h @@ -8,6 +8,7 @@ #ifndef _ROSE_H #define _ROSE_H +#include #include #include #include @@ -96,7 +97,7 @@ struct rose_neigh { ax25_cb *ax25; struct net_device *dev; unsigned short count; - unsigned short use; + refcount_t use; unsigned int number; char restarted; char dce_mode; @@ -151,12 +152,19 @@ struct rose_sock { #define rose_sk(sk) ((struct rose_sock *)(sk)) +static inline void rose_neigh_hold(struct rose_neigh *rose_neigh) +{ + refcount_inc(&rose_neigh->use); +} + static inline void rose_neigh_put(struct rose_neigh *rose_neigh) { - if (rose_neigh->ax25) - ax25_cb_put(rose_neigh->ax25); - kfree(rose_neigh->digipeat); - kfree(rose_neigh); + if (refcount_dec_and_test(&rose_neigh->use)) { + if (rose_neigh->ax25) + ax25_cb_put(rose_neigh->ax25); + kfree(rose_neigh->digipeat); + kfree(rose_neigh); + } } /* af_rose.c */ diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 4e72b636a46a..543f9e8ebb69 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -170,7 +170,7 @@ void rose_kill_by_neigh(struct rose_neigh *neigh) if (rose->neighbour == neigh) { rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); rose->neighbour = NULL; } } @@ -212,7 +212,7 @@ start: if (rose->device == dev) { rose_disconnect(sk, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); if (rose->neighbour) - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); netdev_put(rose->device, &rose->dev_tracker); rose->device = NULL; } @@ -655,7 +655,7 @@ static int rose_release(struct socket *sock) break; case ROSE_STATE_2: - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); release_sock(sk); rose_disconnect(sk, 0, -1, -1); lock_sock(sk); @@ -823,6 +823,7 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le rose->lci = rose_new_lci(rose->neighbour); if (!rose->lci) { err = -ENETUNREACH; + rose_neigh_put(rose->neighbour); goto out_release; } @@ -834,12 +835,14 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le dev = rose_dev_first(); if (!dev) { err = -ENETUNREACH; + rose_neigh_put(rose->neighbour); goto out_release; } user = ax25_findbyuid(current_euid()); if (!user) { err = -EINVAL; + rose_neigh_put(rose->neighbour); dev_put(dev); goto out_release; } @@ -874,8 +877,6 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le rose->state = ROSE_STATE_1; - rose->neighbour->use++; - rose_write_internal(sk, ROSE_CALL_REQUEST); rose_start_heartbeat(sk); rose_start_t1timer(sk); @@ -1077,7 +1078,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros GFP_ATOMIC); make_rose->facilities = facilities; - make_rose->neighbour->use++; + rose_neigh_hold(make_rose->neighbour); if (rose_sk(sk)->defer) { make_rose->state = ROSE_STATE_5; diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 3e99181e759f..0276b393f0e5 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -56,7 +56,7 @@ static int rose_state1_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; default: @@ -79,12 +79,12 @@ static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; case ROSE_CLEAR_CONFIRMATION: rose_disconnect(sk, 0, -1, -1); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; default: @@ -121,7 +121,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; case ROSE_RR: @@ -234,7 +234,7 @@ static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; default: @@ -254,7 +254,7 @@ static int rose_state5_machine(struct sock *sk, struct sk_buff *skb, int framety if (frametype == ROSE_CLEAR_REQUEST) { rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose_sk(sk)->neighbour->use--; + rose_neigh_put(rose_sk(sk)->neighbour); } return 0; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 0c44c416f485..8efb9033c057 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -93,11 +93,11 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, rose_neigh->ax25 = NULL; rose_neigh->dev = dev; rose_neigh->count = 0; - rose_neigh->use = 0; rose_neigh->dce_mode = 0; rose_neigh->loopback = 0; rose_neigh->number = rose_neigh_no++; rose_neigh->restarted = 0; + refcount_set(&rose_neigh->use, 1); skb_queue_head_init(&rose_neigh->queue); @@ -255,10 +255,10 @@ static void rose_remove_route(struct rose_route *rose_route) struct rose_route *s; if (rose_route->neigh1 != NULL) - rose_route->neigh1->use--; + rose_neigh_put(rose_route->neigh1); if (rose_route->neigh2 != NULL) - rose_route->neigh2->use--; + rose_neigh_put(rose_route->neigh2); if ((s = rose_route_list) == rose_route) { rose_route_list = rose_route->next; @@ -323,7 +323,7 @@ static int rose_del_node(struct rose_route_struct *rose_route, if (rose_node->neighbour[i] == rose_neigh) { rose_neigh->count--; - if (rose_neigh->count == 0 && rose_neigh->use == 0) { + if (rose_neigh->count == 0) { rose_remove_neigh(rose_neigh); rose_neigh_put(rose_neigh); } @@ -375,11 +375,11 @@ void rose_add_loopback_neigh(void) sn->ax25 = NULL; sn->dev = NULL; sn->count = 0; - sn->use = 0; sn->dce_mode = 1; sn->loopback = 1; sn->number = rose_neigh_no++; sn->restarted = 1; + refcount_set(&sn->use, 1); skb_queue_head_init(&sn->queue); @@ -561,8 +561,7 @@ static int rose_clear_routes(void) s = rose_neigh; rose_neigh = rose_neigh->next; - if (s->use == 0 && !s->loopback) { - s->count = 0; + if (!s->loopback) { rose_remove_neigh(s); rose_neigh_put(s); } @@ -680,6 +679,7 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, for (i = 0; i < node->count; i++) { if (node->neighbour[i]->restarted) { res = node->neighbour[i]; + rose_neigh_hold(node->neighbour[i]); goto out; } } @@ -691,6 +691,7 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, for (i = 0; i < node->count; i++) { if (!rose_ftimer_running(node->neighbour[i])) { res = node->neighbour[i]; + rose_neigh_hold(node->neighbour[i]); goto out; } failed = 1; @@ -780,13 +781,13 @@ static void rose_del_route_by_neigh(struct rose_neigh *rose_neigh) } if (rose_route->neigh1 == rose_neigh) { - rose_route->neigh1->use--; + rose_neigh_put(rose_route->neigh1); rose_route->neigh1 = NULL; rose_transmit_clear_request(rose_route->neigh2, rose_route->lci2, ROSE_OUT_OF_ORDER, 0); } if (rose_route->neigh2 == rose_neigh) { - rose_route->neigh2->use--; + rose_neigh_put(rose_route->neigh2); rose_route->neigh2 = NULL; rose_transmit_clear_request(rose_route->neigh1, rose_route->lci1, ROSE_OUT_OF_ORDER, 0); } @@ -915,7 +916,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) rose_clear_queues(sk); rose->cause = ROSE_NETWORK_CONGESTION; rose->diagnostic = 0; - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); rose->neighbour = NULL; rose->lci = 0; rose->state = ROSE_STATE_0; @@ -1040,12 +1041,12 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) if ((new_lci = rose_new_lci(new_neigh)) == 0) { rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 71); - goto out; + goto put_neigh; } if ((rose_route = kmalloc(sizeof(*rose_route), GFP_ATOMIC)) == NULL) { rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 120); - goto out; + goto put_neigh; } rose_route->lci1 = lci; @@ -1058,8 +1059,8 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) rose_route->lci2 = new_lci; rose_route->neigh2 = new_neigh; - rose_route->neigh1->use++; - rose_route->neigh2->use++; + rose_neigh_hold(rose_route->neigh1); + rose_neigh_hold(rose_route->neigh2); rose_route->next = rose_route_list; rose_route_list = rose_route; @@ -1071,6 +1072,8 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) rose_transmit_link(skb, rose_route->neigh2); res = 1; +put_neigh: + rose_neigh_put(new_neigh); out: spin_unlock_bh(&rose_route_list_lock); spin_unlock_bh(&rose_neigh_list_lock); @@ -1186,7 +1189,7 @@ static int rose_neigh_show(struct seq_file *seq, void *v) (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign), rose_neigh->dev ? rose_neigh->dev->name : "???", rose_neigh->count, - rose_neigh->use, + refcount_read(&rose_neigh->use) - 1, (rose_neigh->dce_mode) ? "DCE" : "DTE", (rose_neigh->restarted) ? "yes" : "no", ax25_display_timer(&rose_neigh->t0timer) / HZ, diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index 020369c49587..bb60a1654d61 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -180,7 +180,7 @@ static void rose_timer_expiry(struct timer_list *t) break; case ROSE_STATE_2: /* T3 */ - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); rose_disconnect(sk, ETIMEDOUT, -1, -1); break; -- cgit v1.2.3 From 97bcc5b6f45425ac56fb04b0893cdaa607ec7e45 Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Mon, 25 Aug 2025 08:40:04 +0530 Subject: net: Prevent RPS table overwrite of active flows This patch fixes an issue where two different flows on the same RXq produce the same hash resulting in continuous flow overwrites. Flow #1: A packet for Flow #1 comes in, kernel calls the steering function. The driver gives back a filter id. The kernel saves this filter id in the selected slot. Later, the driver's service task checks if any filters have expired and then installs the rule for Flow #1. Flow #2: A packet for Flow #2 comes in. It goes through the same steps. But this time, the chosen slot is being used by Flow #1. The driver gives a new filter id and the kernel saves it in the same slot. When the driver's service task runs, it runs through all the flows, checks if Flow #1 should be expired, the kernel returns True as the slot has a different filter id, and then the driver installs the rule for Flow #2. Flow #1: Another packet for Flow #1 comes in. The same thing repeats. The slot is overwritten with a new filter id for Flow #1. This causes a repeated cycle of flow programming for missed packets, wasting CPU cycles while not improving performance. This problem happens at higher rates when the RPS table is small, but tests show it still happens even with 12,000 connections and an RPS size of 16K per queue (global table size = 144x16K = 64K). This patch prevents overwriting an rps_dev_flow entry if it is active. The intention is that it is better to do aRFS for the first flow instead of hurting all flows on the same hash. Without this, two (or more) flows on one RX queue with the same hash can keep overwriting each other. This causes the driver to reprogram the flow repeatedly. Changes: 1. Add a new 'hash' field to struct rps_dev_flow. 2. Add rps_flow_is_active(): a helper function to check if a flow is active or not, extracted from rps_may_expire_flow(). It is further simplified as per reviewer feedback. 3. In set_rps_cpu(): - Avoid overwriting by programming a new filter if: - The slot is not in use, or - The slot is in use but the flow is not active, or - The slot has an active flow with the same hash, but target CPU differs. - Save the hash in the rps_dev_flow entry. 4. rps_may_expire_flow(): Use earlier extracted rps_flow_is_active(). Testing & results: - Driver: ice (E810 NIC), Kernel: net-next - #CPUs = #RXq = 144 (1:1) - Number of flows: 12K - Eight RPS settings from 256 to 32768. Though RPS=256 is not ideal, it is still sufficient to cover 12K flows (256*144 rx-queues = 64K global table slots) - Global Table Size = 144 * RPS (effectively equal to 256 * RPS) - Each RPS test duration = 8 mins (org code) + 8 mins (new code). - Metrics captured on client Legend for following tables: Steer-C: #times ndo_rx_flow_steer() was Called by set_rps_cpu() Steer-L: #times ice_arfs_flow_steer() Looped over aRFS entries Add: #times driver actually programmed aRFS (ice_arfs_build_entry()) Del: #times driver deleted the flow (ice_arfs_del_flow_rules()) Units: K = 1,000 times, M = 1 million times |-------|---------|------| Org Code |---------|---------| | RPS | Latency | CPU | Add | Del | Steer-C | Steer-L | |-------|---------|------|--------|--------|---------|---------| | 256 | 227.0 | 93.2 | 1.6M | 1.6M | 121.7M | 267.6M | | 512 | 225.9 | 94.1 | 11.5M | 11.2M | 65.7M | 199.6M | | 1024 | 223.5 | 95.6 | 16.5M | 16.5M | 27.1M | 187.3M | | 2048 | 222.2 | 96.3 | 10.5M | 10.5M | 12.5M | 115.2M | | 4096 | 223.9 | 94.1 | 5.5M | 5.5M | 7.2M | 65.9M | | 8192 | 224.7 | 92.5 | 2.7M | 2.7M | 3.0M | 29.9M | | 16384 | 223.5 | 92.5 | 1.3M | 1.3M | 1.4M | 13.9M | | 32768 | 219.6 | 93.2 | 838.1K | 838.1K | 965.1K | 8.9M | |-------|---------|------| New Code |---------|---------| | 256 | 201.5 | 99.1 | 13.4K | 5.0K | 13.7K | 75.2K | | 512 | 202.5 | 98.2 | 11.2K | 5.9K | 11.2K | 55.5K | | 1024 | 207.3 | 93.9 | 11.5K | 9.7K | 11.5K | 59.6K | | 2048 | 207.5 | 96.7 | 11.8K | 11.1K | 15.5K | 79.3K | | 4096 | 206.9 | 96.6 | 11.8K | 11.7K | 11.8K | 63.2K | | 8192 | 205.8 | 96.7 | 11.9K | 11.8K | 11.9K | 63.9K | | 16384 | 200.9 | 98.2 | 11.9K | 11.9K | 11.9K | 64.2K | | 32768 | 202.5 | 98.0 | 11.9K | 11.9K | 11.9K | 64.2K | |-------|---------|------|--------|--------|---------|---------| Some observations: 1. Overall Latency improved: (1790.19-1634.94)/1790.19*100 = 8.67% 2. Overall CPU increased: (777.32-751.49)/751.45*100 = 3.44% 3. Flow Management (add/delete) remained almost constant at ~11K compared to values in millions. Signed-off-by: Krishna Kumar Link: https://patch.msgid.link/20250825031005.3674864-2-krikku@gmail.com Signed-off-by: Jakub Kicinski --- include/net/rps.h | 7 ++++-- net/core/dev.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++------ net/core/net-sysfs.c | 4 +++- 3 files changed, 65 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/rps.h b/include/net/rps.h index d8ab3a08bcc4..9917dce42ca4 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -25,13 +25,16 @@ struct rps_map { /* * The rps_dev_flow structure contains the mapping of a flow to a CPU, the - * tail pointer for that CPU's input queue at the time of last enqueue, and - * a hardware filter index. + * tail pointer for that CPU's input queue at the time of last enqueue, a + * hardware filter index, and the hash of the flow if aRFS is enabled. */ struct rps_dev_flow { u16 cpu; u16 filter; unsigned int last_qtail; +#ifdef CONFIG_RFS_ACCEL + u32 hash; +#endif }; #define RPS_NO_FILTER 0xffff diff --git a/net/core/dev.c b/net/core/dev.c index 93a25d87b86b..fbf0894c55d0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4849,6 +4849,36 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) return hash_32(hash, flow_table->log); } +#ifdef CONFIG_RFS_ACCEL +/** + * rps_flow_is_active - check whether the flow is recently active. + * @rflow: Specific flow to check activity. + * @flow_table: per-queue flowtable that @rflow belongs to. + * @cpu: CPU saved in @rflow. + * + * If the CPU has processed many packets since the flow's last activity + * (beyond 10 times the table size), the flow is considered stale. + * + * Return: true if flow was recently active. + */ +static bool rps_flow_is_active(struct rps_dev_flow *rflow, + struct rps_dev_flow_table *flow_table, + unsigned int cpu) +{ + unsigned int flow_last_active; + unsigned int sd_input_head; + + if (cpu >= nr_cpu_ids) + return false; + + sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head); + flow_last_active = READ_ONCE(rflow->last_qtail); + + return (int)(sd_input_head - flow_last_active) < + (int)(10 << flow_table->log); +} +#endif + static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) @@ -4859,8 +4889,11 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; + struct rps_dev_flow *tmp_rflow; + unsigned int tmp_cpu; u16 rxq_index; u32 flow_id; + u32 hash; int rc; /* Should we steer this flow to a different hardware queue? */ @@ -4875,14 +4908,32 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = rfs_slot(skb_get_hash(skb), flow_table); + + hash = skb_get_hash(skb); + flow_id = rfs_slot(hash, flow_table); + + tmp_rflow = &flow_table->flows[flow_id]; + tmp_cpu = READ_ONCE(tmp_rflow->cpu); + + if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { + if (rps_flow_is_active(tmp_rflow, flow_table, + tmp_cpu)) { + if (hash != READ_ONCE(tmp_rflow->hash) || + next_cpu == tmp_cpu) + goto out; + } + } + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; + old_rflow = rflow; - rflow = &flow_table->flows[flow_id]; + rflow = tmp_rflow; WRITE_ONCE(rflow->filter, rc); + WRITE_ONCE(rflow->hash, hash); + if (old_rflow->filter == rc) WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: @@ -5017,17 +5068,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id < (1UL << flow_table->log)) { + unsigned int cpu; + rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); - if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && - ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - - READ_ONCE(rflow->last_qtail)) < - (int)(10 << flow_table->log))) + if (READ_ONCE(rflow->filter) == filter_id && + rps_flow_is_active(rflow, flow_table, cpu)) expire = false; } rcu_read_unlock(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c28cd6665444..5ea9f64adce3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1120,8 +1120,10 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return -ENOMEM; table->log = ilog2(mask) + 1; - for (count = 0; count <= mask; count++) + for (count = 0; count <= mask; count++) { table->flows[count].cpu = RPS_NO_CPU; + table->flows[count].filter = RPS_NO_FILTER; + } } else { table = NULL; } -- cgit v1.2.3 From f86f42ed2c471da5b061492bb8ab1d3d73c19c58 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:27 +0000 Subject: net: add sk_drops_read(), sk_drops_inc() and sk_drops_reset() helpers We want to split sk->sk_drops in the future to reduce potential contention on this field. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-2-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sock.h | 17 ++++++++++++++++- include/net/tcp.h | 2 +- net/core/datagram.c | 2 +- net/core/sock.c | 14 +++++++------- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 6 +++--- net/ipv4/udp.c | 14 +++++++------- net/ipv6/datagram.c | 2 +- net/ipv6/raw.c | 8 ++++---- net/ipv6/udp.c | 6 +++--- net/iucv/af_iucv.c | 4 ++-- net/netlink/af_netlink.c | 4 ++-- net/packet/af_packet.c | 2 +- net/phonet/pep.c | 6 +++--- net/phonet/socket.c | 2 +- net/sctp/diag.c | 2 +- net/tipc/socket.c | 6 +++--- 17 files changed, 57 insertions(+), 42 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 63a6a48afb48..34d7029eb622 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2682,11 +2682,26 @@ struct sock_skb_cb { #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) +static inline void sk_drops_inc(struct sock *sk) +{ + atomic_inc(&sk->sk_drops); +} + +static inline int sk_drops_read(const struct sock *sk) +{ + return atomic_read(&sk->sk_drops); +} + +static inline void sk_drops_reset(struct sock *sk) +{ + atomic_set(&sk->sk_drops, 0); +} + static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? - atomic_read(&sk->sk_drops) : 0; + sk_drops_read(sk) : 0; } static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) diff --git a/include/net/tcp.h b/include/net/tcp.h index 2936b8175950..16dc9cebb9d2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2612,7 +2612,7 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) */ static inline void tcp_listendrop(const struct sock *sk) { - atomic_inc(&((struct sock *)sk)->sk_drops); + sk_drops_inc((struct sock *)sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } diff --git a/net/core/datagram.c b/net/core/datagram.c index 94cc4705e91d..ba8253aa6e07 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -345,7 +345,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, spin_unlock_bh(&sk_queue->lock); } - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return err; } EXPORT_SYMBOL(__sk_queue_drop_skb); diff --git a/net/core/sock.c b/net/core/sock.c index 8002ac6293dc..75368823969a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -491,13 +491,13 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return -ENOBUFS; } @@ -562,7 +562,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, skb->dev = NULL; if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); reason = SKB_DROP_REASON_SOCKET_RCVBUFF; goto discard_and_relse; } @@ -585,7 +585,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, reason = SKB_DROP_REASON_PFMEMALLOC; if (err == -ENOBUFS) reason = SKB_DROP_REASON_SOCKET_BACKLOG; - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); goto discard_and_relse; } @@ -2505,7 +2505,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; - atomic_set(&newsk->sk_drops, 0); + sk_drops_reset(newsk); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); @@ -3713,7 +3713,7 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); - atomic_set(&sk->sk_drops, 0); + sk_drops_reset(sk); } EXPORT_SYMBOL(sock_init_data_uid); @@ -3973,7 +3973,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); } #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 031df4c19fcc..f119da68fc30 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1119,7 +1119,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } static int ping_v4_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 1d2c89d63cc7..0f9f02f6146e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -178,7 +178,7 @@ static int raw_v4_input(struct net *net, struct sk_buff *skb, if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); continue; } @@ -311,7 +311,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -1045,7 +1045,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), 0, sock_i_ino(sp), - refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); + refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp)); } static int raw_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cc3ce0f762ec..732bdad43626 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1787,7 +1787,7 @@ uncharge_drop: atomic_sub(skb->truesize, &sk->sk_rmem_alloc); drop: - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); busylock_release(busy); return err; } @@ -1852,7 +1852,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, IS_UDPLITE(sk)); __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); @@ -2008,7 +2008,7 @@ try_again: __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } @@ -2078,7 +2078,7 @@ try_again: if (unlikely(err)) { if (!peeking) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } @@ -2449,7 +2449,7 @@ csum_error: __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2534,7 +2534,7 @@ start_lookup: nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(net, UDP_MIB_INERRORS, @@ -3386,7 +3386,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } int udp4_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 972bf0426d59..33ebe93d80e3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -1068,5 +1068,5 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4c3f8245c40f..4026192143ec 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -163,7 +163,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); continue; } @@ -361,7 +361,7 @@ static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } @@ -389,7 +389,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) struct raw6_sock *rp = raw6_sk(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -414,7 +414,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6a68f77da44b..a35ee6d693a8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -524,7 +524,7 @@ try_again: } if (unlikely(err)) { if (!peeking) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -908,7 +908,7 @@ csum_error: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -1013,7 +1013,7 @@ start_lookup: } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP6_INC_STATS(net, UDP_MIB_INERRORS, diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index cc2b3c44bc05..6c717a7ef292 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1187,7 +1187,7 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb, IUCV_SKB_CB(skb)->offset = 0; if (sk_filter(sk, skb)) { - atomic_inc(&sk->sk_drops); /* skb rejected by filter */ + sk_drops_inc(sk); /* skb rejected by filter */ kfree_skb(skb); return; } @@ -2011,7 +2011,7 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb) skb_reset_network_header(skb); IUCV_SKB_CB(skb)->offset = 0; if (sk_filter(sk, skb)) { - atomic_inc(&sk->sk_drops); /* skb rejected by filter */ + sk_drops_inc(sk); /* skb rejected by filter */ kfree_skb(skb); return NET_RX_SUCCESS; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e2f7080dd5d7..2b46c0cd752a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -356,7 +356,7 @@ static void netlink_overrun(struct sock *sk) sk_error_report(sk); } } - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); } static void netlink_rcv_wake(struct sock *sk) @@ -2711,7 +2711,7 @@ static int netlink_native_seq_show(struct seq_file *seq, void *v) sk_wmem_alloc_get(s), READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), - atomic_read(&s->sk_drops), + sk_drops_read(s), sock_i_ino(s) ); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a7017d7f0927..9d42c4bd6e39 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2265,7 +2265,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, drop_n_acct: atomic_inc(&po->tp_drops); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 62527e1ebb88..4db564d9d522 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -376,7 +376,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) case PNS_PEP_CTRL_REQ: if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); break; } __skb_pull(skb, 4); @@ -397,7 +397,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) } if (pn->rx_credits == 0) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = -ENOBUFS; break; } @@ -567,7 +567,7 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb) } if (pn->rx_credits == 0) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = NET_RX_DROP; break; } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 2b61a40b568e..db2d552e9b32 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -587,7 +587,7 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v) from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, - atomic_read(&sk->sk_drops)); + sk_drops_read(sk)); } seq_pad(seq, '\n'); return 0; diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 23359e522273..996c2018f0e6 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -173,7 +173,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) goto errout; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e028bf658499..1574a83384f8 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2366,7 +2366,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) { trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload2!"); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = TIPC_ERR_OVERLOAD; } @@ -2458,7 +2458,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload!"); /* Overload => reject message back to sender */ onode = tipc_own_addr(sock_net(sk)); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) { trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_ALL, "@sk_enqueue!"); @@ -3657,7 +3657,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ, skb_queue_len(&sk->sk_write_queue)) || nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP, - atomic_read(&sk->sk_drops))) + sk_drops_read(sk))) goto stat_msg_cancel; if (tsk->cong_link_cnt && -- cgit v1.2.3 From cb4d5a6eb600a43c2e3ec7f54e06d07aa33d8062 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:28 +0000 Subject: net: add sk_drops_skbadd() helper Existing sk_drops_add() helper is renamed to sk_drops_skbadd(). Add sk_drops_add() and convert sk_drops_inc() to use it. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-3-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/skmsg.h | 2 +- include/net/sock.h | 11 ++++++++--- include/net/udp.h | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- net/mptcp/protocol.c | 2 +- 7 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 0b9095a281b8..49847888c287 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -315,7 +315,7 @@ static inline bool sk_psock_test_state(const struct sk_psock *psock, static inline void sock_drop(struct sock *sk, struct sk_buff *skb) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); kfree_skb(skb); } diff --git a/include/net/sock.h b/include/net/sock.h index 34d7029eb622..9edb42ff0622 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2682,9 +2682,14 @@ struct sock_skb_cb { #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) +static inline void sk_drops_add(struct sock *sk, int segs) +{ + atomic_add(segs, &sk->sk_drops); +} + static inline void sk_drops_inc(struct sock *sk) { - atomic_inc(&sk->sk_drops); + sk_drops_add(sk, 1); } static inline int sk_drops_read(const struct sock *sk) @@ -2704,11 +2709,11 @@ sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) sk_drops_read(sk) : 0; } -static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) +static inline void sk_drops_skbadd(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); - atomic_add(segs, &sk->sk_drops); + sk_drops_add(sk, segs); } static inline ktime_t sock_read_timestamp(struct sock *sk) diff --git a/include/net/udp.h b/include/net/udp.h index e2af3bda90c9..7b26d4c50f33 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -627,7 +627,7 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, return segs; drop: - atomic_add(drop_count, &sk->sk_drops); + sk_drops_add(sk, drop_count); SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, drop_count); kfree_skb(skb); return NULL; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a52a747d8a55..f1be65af1a77 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4830,7 +4830,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk, noinline_for_tracing static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); sk_skb_reason_drop(sk, skb, reason); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a0c93b24c6e0..7c1d612afca1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2254,7 +2254,7 @@ lookup: &iph->saddr, &iph->daddr, AF_INET, dif, sdif); if (unlikely(drop_reason)) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } @@ -2399,7 +2399,7 @@ discard_it: return 0; discard_and_relse: - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8b2e7b7afbd8..b4e56b877273 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1809,7 +1809,7 @@ lookup: &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } @@ -1948,7 +1948,7 @@ discard_it: return 0; discard_and_relse: - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f2e728239480..ad41c48126e4 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -137,7 +137,7 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) static void mptcp_drop(struct sock *sk, struct sk_buff *skb) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); __kfree_skb(skb); } -- cgit v1.2.3 From c51613fa276f038bdd18656a57a90ccc5d4e5200 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:29 +0000 Subject: net: add sk->sk_drop_counters Some sockets suffer from heavy false sharing on sk->sk_drops, and fields in the same cache line. Add sk->sk_drop_counters to: - move the drop counter(s) to dedicated cache lines. - Add basic NUMA awareness to these drop counter(s). Following patches will use this infrastructure for UDP and RAW sockets. sk_clone_lock() is not yet ready, it would need to properly set newsk->sk_drop_counters if we plan to use this for TCP sockets. v2: used Paolo suggestion from https://lore.kernel.org/netdev/8f09830a-d83d-43c9-b36b-88ba0a23e9b2@redhat.com/ Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-4-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sock.h | 32 +++++++++++++++++++++++++++++++- net/core/sock.c | 2 ++ 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 9edb42ff0622..73cd3316e288 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -102,6 +102,11 @@ struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; +struct socket_drop_counters { + atomic_t drops0 ____cacheline_aligned_in_smp; + atomic_t drops1 ____cacheline_aligned_in_smp; +}; + /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr @@ -282,6 +287,7 @@ struct sk_filter; * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter + * @sk_drop_counters: optional pointer to socket_drop_counters * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner @@ -449,6 +455,7 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif + struct socket_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -2684,7 +2691,18 @@ struct sock_skb_cb { static inline void sk_drops_add(struct sock *sk, int segs) { - atomic_add(segs, &sk->sk_drops); + struct socket_drop_counters *sdc = sk->sk_drop_counters; + + if (sdc) { + int n = numa_node_id() % 2; + + if (n) + atomic_add(segs, &sdc->drops1); + else + atomic_add(segs, &sdc->drops0); + } else { + atomic_add(segs, &sk->sk_drops); + } } static inline void sk_drops_inc(struct sock *sk) @@ -2694,11 +2712,23 @@ static inline void sk_drops_inc(struct sock *sk) static inline int sk_drops_read(const struct sock *sk) { + const struct socket_drop_counters *sdc = sk->sk_drop_counters; + + if (sdc) { + DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops)); + return atomic_read(&sdc->drops0) + atomic_read(&sdc->drops1); + } return atomic_read(&sk->sk_drops); } static inline void sk_drops_reset(struct sock *sk) { + struct socket_drop_counters *sdc = sk->sk_drop_counters; + + if (sdc) { + atomic_set(&sdc->drops0, 0); + atomic_set(&sdc->drops1, 0); + } atomic_set(&sk->sk_drops, 0); } diff --git a/net/core/sock.c b/net/core/sock.c index 75368823969a..e66ad1ec3a2d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2505,6 +2505,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; + DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); sk_drops_reset(newsk); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; @@ -4457,6 +4458,7 @@ static int __init sock_struct_check(void) #ifdef CONFIG_MEMCG CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); #endif + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_drop_counters); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); -- cgit v1.2.3 From 51132b99f01ce05f8008f0fb189d83eed484bd53 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:30 +0000 Subject: udp: add drop_counters to udp socket When a packet flood hits one or more UDP sockets, many cpus have to update sk->sk_drops. This slows down other cpus, because currently sk_drops is in sock_write_rx group. Add a socket_drop_counters structure to udp sockets. Using dedicated cache lines to hold drop counters makes sure that consumers no longer suffer from false sharing if/when producers only change sk->sk_drops. This adds 128 bytes per UDP socket. Tested with the following stress test, sending about 11 Mpps to a dual socket AMD EPYC 7B13 64-Core. super_netperf 20 -t UDP_STREAM -H DUT -l10 -- -n -P,1000 -m 120 Note: due to socket lookup, only one UDP socket is receiving packets on DUT. Then measure receiver (DUT) behavior. We can see both consumer and BH handlers can process more packets per second. Before: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 615091 0.0 Udp6InErrors 3904277 0.0 Udp6RcvbufErrors 3904277 0.0 After: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 816281 0.0 Udp6InErrors 7497093 0.0 Udp6RcvbufErrors 7497093 0.0 Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-5-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/udp.h | 1 + include/net/udp.h | 1 + tools/testing/selftests/bpf/progs/bpf_iter_udp4.c | 3 ++- tools/testing/selftests/bpf/progs/bpf_iter_udp6.c | 4 ++-- 4 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/linux/udp.h b/include/linux/udp.h index 4e1a672af4c5..981506be1e15 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,6 +108,7 @@ struct udp_sock { * the last UDP socket cacheline. */ struct hlist_node tunnel_list; + struct socket_drop_counters drop_counters; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/udp.h b/include/net/udp.h index 7b26d4c50f33..93b159f30e88 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -288,6 +288,7 @@ static inline void udp_lib_init_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); + sk->sk_drop_counters = &up->drop_counters; skb_queue_head_init(&up->reader_queue); INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c index ffbd4b116d17..23b2aa2604de 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c @@ -64,7 +64,8 @@ int dump_udp4(struct bpf_iter__udp *ctx) 0, 0L, 0, ctx->uid, 0, sock_i_ino(&inet->sk), inet->sk.sk_refcnt.refs.counter, udp_sk, - inet->sk.sk_drops.counter); + udp_sk->drop_counters.drops0.counter + + udp_sk->drop_counters.drops1.counter); return 0; } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c index 47ff7754f4fd..c48b05aa2a4b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c @@ -72,7 +72,7 @@ int dump_udp6(struct bpf_iter__udp *ctx) 0, 0L, 0, ctx->uid, 0, sock_i_ino(&inet->sk), inet->sk.sk_refcnt.refs.counter, udp_sk, - inet->sk.sk_drops.counter); - + udp_sk->drop_counters.drops0.counter + + udp_sk->drop_counters.drops1.counter); return 0; } -- cgit v1.2.3 From b81aa23234d94d99951761d9864061d774633ba9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:31 +0000 Subject: inet: raw: add drop_counters to raw sockets When a packet flood hits one or more RAW sockets, many cpus have to update sk->sk_drops. This slows down other cpus, because currently sk_drops is in sock_write_rx group. Add a socket_drop_counters structure to raw sockets. Using dedicated cache lines to hold drop counters makes sure that consumers no longer suffer from false sharing if/when producers only change sk->sk_drops. This adds 128 bytes per RAW socket. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-6-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 2 +- include/net/raw.h | 1 + net/ipv4/raw.c | 1 + net/ipv6/raw.c | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index bc6ec2959173..261d02efb615 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -295,7 +295,7 @@ struct raw6_sock { __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; - + struct socket_drop_counters drop_counters; struct ipv6_pinfo inet6; }; diff --git a/include/net/raw.h b/include/net/raw.h index 32a61481a253..d52709139060 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -81,6 +81,7 @@ struct raw_sock { struct inet_sock inet; struct icmp_filter filter; u32 ipmr_table; + struct socket_drop_counters drop_counters; }; #define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0f9f02f6146e..d54ebb7df966 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -793,6 +793,7 @@ static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4026192143ec..4ae07a67b4d4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1175,6 +1175,7 @@ static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; switch (inet_sk(sk)->inet_num) { case IPPROTO_ICMPV6: rp->checksum = 1; -- cgit v1.2.3 From 13d8e05adf9dd06c74fcc6ba42ec4bf780fd557f Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Wed, 27 Aug 2025 17:39:55 +0300 Subject: queue_api: add support for fetching per queue DMA dev For zerocopy (io_uring, devmem), there is an assumption that the parent device can do DMA. However that is not always the case: - Scalable Function netdevs [1] have the DMA device in the grandparent. - For Multi-PF netdevs [2] queues can be associated to different DMA devices. This patch introduces the a queue based interface for allowing drivers to expose a different DMA device for zerocopy. [1] Documentation/networking/device_drivers/ethernet/mellanox/mlx5/switchdev.rst [2] Documentation/networking/multi-pf-netdev.rst Signed-off-by: Dragos Tatulea Reviewed-by: Pavel Begunkov Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250827144017.1529208-3-dtatulea@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/netdev_queues.h | 7 +++++++ net/core/Makefile | 1 + net/core/netdev_queues.c | 27 +++++++++++++++++++++++++++ 3 files changed, 35 insertions(+) create mode 100644 net/core/netdev_queues.c (limited to 'include/net') diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 6e835972abd1..b9d02bc65c97 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -127,6 +127,9 @@ void netdev_stat_queue_sum(struct net_device *netdev, * @ndo_queue_stop: Stop the RX queue at the specified index. The stopped * queue's memory is written at the specified address. * + * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used + * for this queue. Return NULL on error. + * * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only * be called for an interface which is open. @@ -144,6 +147,8 @@ struct netdev_queue_mgmt_ops { int (*ndo_queue_stop)(struct net_device *dev, void *per_queue_mem, int idx); + struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, + int idx); }; /** @@ -321,4 +326,6 @@ static inline void netif_subqueue_sent(const struct net_device *dev, get_desc, start_thrs); \ }) +struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx); + #endif diff --git a/net/core/Makefile b/net/core/Makefile index b2a76ce33932..9ef2099c5426 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-y += hotdata.o obj-y += netdev_rx_queue.o +obj-y += netdev_queues.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c new file mode 100644 index 000000000000..251f27a8307f --- /dev/null +++ b/net/core/netdev_queues.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +/** + * netdev_queue_get_dma_dev() - get dma device for zero-copy operations + * @dev: net_device + * @idx: queue index + * + * Get dma device for zero-copy operations to be used for this queue. + * When such device is not available or valid, the function will return NULL. + * + * Return: Device or NULL on error + */ +struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) +{ + const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops; + struct device *dma_dev; + + if (queue_ops && queue_ops->ndo_queue_get_dma_dev) + dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx); + else + dma_dev = dev->dev.parent; + + return dma_dev && dma_dev->dma_mask ? dma_dev : NULL; +} + -- cgit v1.2.3 From 48b5e5dbdb234ffc951cacceaec7f8ee37c83b2d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Aug 2025 12:53:47 +0000 Subject: net_sched: act_vlan: use RCU in tcf_vlan_dump() Also storing tcf_action into struct tcf_vlan_params makes sure there is no discrepancy in tcf_vlan_act(). No longer block BH in tcf_vlan_init() when acquiring tcf_lock. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250827125349.3505302-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_vlan.h | 1 + net/sched/act_vlan.c | 20 +++++++++----------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 3f5e9242b5e8..beadee41669a 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -10,6 +10,7 @@ #include struct tcf_vlan_params { + int action; int tcfv_action; unsigned char tcfv_push_dst[ETH_ALEN]; unsigned char tcfv_push_src[ETH_ALEN]; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 383bf18b6862..b46f980f3b2a 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -25,7 +25,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb, { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; - int action; int err; u16 tci; @@ -38,8 +37,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb, if (skb_at_tc_ingress(skb)) skb_push_rcsum(skb, skb->mac_len); - action = READ_ONCE(v->tcf_action); - p = rcu_dereference_bh(v->vlan_p); switch (p->tcfv_action) { @@ -97,7 +94,7 @@ out: skb_pull_rcsum(skb, skb->mac_len); skb_reset_mac_len(skb); - return action; + return p->action; drop: tcf_action_inc_drop_qstats(&v->common); @@ -255,10 +252,11 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, ETH_ALEN); } - spin_lock_bh(&v->tcf_lock); + p->action = parm->action; + spin_lock(&v->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); - spin_unlock_bh(&v->tcf_lock); + spin_unlock(&v->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); @@ -297,9 +295,9 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&v->tcf_lock); - opt.action = v->tcf_action; - p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(v->vlan_p); + opt.action = p->action; opt.v_action = p->tcfv_action; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -325,12 +323,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; - spin_unlock_bh(&v->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&v->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From e97ae742972f6cb57986a5ebb846048f80b90003 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Aug 2025 12:53:48 +0000 Subject: net_sched: act_tunnel_key: use RCU in tunnel_key_dump() Also storing tcf_action into struct tcf_tunnel_key_params makes sure there is no discrepancy in tunnel_key_act(). No longer block BH in tunnel_key_init() when acquiring tcf_lock. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250827125349.3505302-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_tunnel_key.h | 1 + net/sched/act_tunnel_key.c | 20 +++++++++----------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index 879fe8cff581..0f1925f97520 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -14,6 +14,7 @@ struct tcf_tunnel_key_params { struct rcu_head rcu; int tcft_action; + int action; struct metadata_dst *tcft_enc_metadata; }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2cef4b08befb..e1c8b48c217c 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -29,13 +29,11 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb, { struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; - int action; params = rcu_dereference_bh(t->params); tcf_lastuse_update(&t->tcf_tm); tcf_action_update_bstats(&t->common, skb); - action = READ_ONCE(t->tcf_action); switch (params->tcft_action) { case TCA_TUNNEL_KEY_ACT_RELEASE: @@ -51,7 +49,7 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb, break; } - return action; + return params->action; } static const struct nla_policy @@ -532,11 +530,12 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; - spin_lock_bh(&t->tcf_lock); + params_new->action = parm->action; + spin_lock(&t->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(t->params, params_new, lockdep_is_held(&t->tcf_lock)); - spin_unlock_bh(&t->tcf_lock); + spin_unlock(&t->tcf_lock); tunnel_key_release_params(params_new); if (goto_ch) tcf_chain_put_by_act(goto_ch); @@ -726,10 +725,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t tm; - spin_lock_bh(&t->tcf_lock); - params = rcu_dereference_protected(t->params, - lockdep_is_held(&t->tcf_lock)); - opt.action = t->tcf_action; + rcu_read_lock(); + params = rcu_dereference(t->params); + opt.action = params->action; opt.t_action = params->tcft_action; if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) @@ -766,12 +764,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), &tm, TCA_TUNNEL_KEY_PAD)) goto nla_put_failure; - spin_unlock_bh(&t->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&t->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 53df77e7859042a92914d664c860f65d9689f88d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Aug 2025 12:53:49 +0000 Subject: net_sched: act_skbmod: use RCU in tcf_skbmod_dump() Also storing tcf_action into struct tcf_skbmod_params makes sure there is no discrepancy in tcf_skbmod_act(). No longer block BH in tcf_skbmod_init() when acquiring tcf_lock. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250827125349.3505302-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_skbmod.h | 1 + net/sched/act_skbmod.c | 26 ++++++++++++-------------- 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_skbmod.h b/include/net/tc_act/tc_skbmod.h index 7c240d2fed4e..626704cd6241 100644 --- a/include/net/tc_act/tc_skbmod.h +++ b/include/net/tc_act/tc_skbmod.h @@ -12,6 +12,7 @@ struct tcf_skbmod_params { struct rcu_head rcu; u64 flags; /*up to 64 types of operations; extend if needed */ + int action; u8 eth_dst[ETH_ALEN]; u16 eth_type; u8 eth_src[ETH_ALEN]; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index dc0229693461..fce625eafcb2 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -27,19 +27,18 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb, struct tcf_result *res) { struct tcf_skbmod *d = to_skbmod(a); - int action, max_edit_len, err; struct tcf_skbmod_params *p; + int max_edit_len, err; u64 flags; tcf_lastuse_update(&d->tcf_tm); bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); - action = READ_ONCE(d->tcf_action); - if (unlikely(action == TC_ACT_SHOT)) + p = rcu_dereference_bh(d->skbmod_p); + if (unlikely(p->action == TC_ACT_SHOT)) goto drop; max_edit_len = skb_mac_header_len(skb); - p = rcu_dereference_bh(d->skbmod_p); flags = p->flags; /* tcf_skbmod_init() guarantees "flags" to be one of the following: @@ -85,7 +84,7 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb, INET_ECN_set_ce(skb); out: - return action; + return p->action; drop: qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); @@ -193,9 +192,9 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, } p->flags = lflags; - + p->action = parm->action; if (ovr) - spin_lock_bh(&d->tcf_lock); + spin_lock(&d->tcf_lock); /* Protected by tcf_lock if overwriting existing action. */ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p_old = rcu_dereference_protected(d->skbmod_p, 1); @@ -209,7 +208,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, rcu_assign_pointer(d->skbmod_p, p); if (ovr) - spin_unlock_bh(&d->tcf_lock); + spin_unlock(&d->tcf_lock); if (p_old) kfree_rcu(p_old, rcu); @@ -248,10 +247,9 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, opt.index = d->tcf_index; opt.refcnt = refcount_read(&d->tcf_refcnt) - ref; opt.bindcnt = atomic_read(&d->tcf_bindcnt) - bind; - spin_lock_bh(&d->tcf_lock); - opt.action = d->tcf_action; - p = rcu_dereference_protected(d->skbmod_p, - lockdep_is_held(&d->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(d->skbmod_p); + opt.action = p->action; opt.flags = p->flags; if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -269,10 +267,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD)) goto nla_put_failure; - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From caedcc5b6df1b2e2b5f39079e3369c1d4d5c5f50 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 19:58:16 +0000 Subject: net: dst: introduce dst->dev_rcu Followup of commit 88fe14253e18 ("net: dst: add four helpers to annotate data-races around dst->dev"). We want to gradually add explicit RCU protection to dst->dev, including lockdep support. Add an union to alias dst->dev_rcu and dst->dev. Add dst_dev_net_rcu() helper. Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dst.h | 16 +++++++++++----- net/core/dst.c | 2 +- net/ipv4/route.c | 4 ++-- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index bab01363bb97..f8aa1239b4db 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -24,7 +24,10 @@ struct sk_buff; struct dst_entry { - struct net_device *dev; + union { + struct net_device *dev; + struct net_device __rcu *dev_rcu; + }; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; @@ -570,9 +573,12 @@ static inline struct net_device *dst_dev(const struct dst_entry *dst) static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst) { - /* In the future, use rcu_dereference(dst->dev) */ - WARN_ON_ONCE(!rcu_read_lock_held()); - return READ_ONCE(dst->dev); + return rcu_dereference(dst->dev_rcu); +} + +static inline struct net *dst_dev_net_rcu(const struct dst_entry *dst) +{ + return dev_net_rcu(dst_dev_rcu(dst)); } static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) @@ -592,7 +598,7 @@ static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) { - return dev_net_rcu(skb_dst_dev(skb)); + return dev_net_rcu(skb_dst_dev_rcu(skb)); } struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); diff --git a/net/core/dst.c b/net/core/dst.c index e2de8b68c41d..e9d35f49c9e7 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -150,7 +150,7 @@ void dst_dev_put(struct dst_entry *dst) dst->ops->ifdown(dst, dev); WRITE_ONCE(dst->input, dst_discard); WRITE_ONCE(dst->output, dst_discard_out); - WRITE_ONCE(dst->dev, blackhole_netdev); + rcu_assign_pointer(dst->dev_rcu, blackhole_netdev); netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, GFP_ATOMIC); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cc86a917a1bb..44382d175589 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1027,7 +1027,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); @@ -1327,7 +1327,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, net->ipv4.ip_rt_min_advmss); rcu_read_unlock(); -- cgit v1.2.3 From 99a2ace61b211b0be861b07fbaa062fca4b58879 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 19:58:20 +0000 Subject: net: use dst_dev_rcu() in sk_setup_caps() Use RCU to protect accesses to dst->dev from sk_setup_caps() and sk_dst_gso_max_size(). Also use dst_dev_rcu() in ip6_dst_mtu_maybe_forward(), and ip_dst_mtu_maybe_forward(). ip4_dst_hoplimit() can use dst_dev_net_rcu(). Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 6 ++++-- include/net/ip6_route.h | 2 +- include/net/route.h | 2 +- net/core/sock.c | 16 ++++++++++------ 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index befcba575129..6dbd2bf8fa9c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -467,12 +467,14 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { const struct rtable *rt = dst_rtable(dst); + const struct net_device *dev; unsigned int mtu, res; struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + dev = dst_dev_rcu(dst); + net = dev_net_rcu(dev); if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { @@ -486,7 +488,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, if (mtu) goto out; - mtu = READ_ONCE(dst_dev(dst)->mtu); + mtu = READ_ONCE(dev->mtu); if (unlikely(ip_mtu_locked(dst))) { if (rt->rt_uses_gateway && mtu > 576) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9255f21818ee..59f48ca3abdf 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -337,7 +337,7 @@ static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst mtu = IPV6_MIN_MTU; rcu_read_lock(); - idev = __in6_dev_get(dst_dev(dst)); + idev = __in6_dev_get(dst_dev_rcu(dst)); if (idev) mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); diff --git a/include/net/route.h b/include/net/route.h index c71998f464f8..f90106f383c5 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -390,7 +390,7 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) const struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); rcu_read_unlock(); } diff --git a/net/core/sock.c b/net/core/sock.c index e66ad1ec3a2d..9a8290fcc35d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2587,7 +2587,7 @@ free: } EXPORT_SYMBOL_GPL(sk_clone_lock); -static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) +static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) { bool is_ipv6 = false; u32 max_size; @@ -2597,8 +2597,8 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ - max_size = is_ipv6 ? READ_ONCE(dst_dev(dst)->gso_max_size) : - READ_ONCE(dst_dev(dst)->gso_ipv4_max_size); + max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) : + READ_ONCE(dev->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; @@ -2607,9 +2607,12 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + const struct net_device *dev; u32 max_segs = 1; - sk->sk_route_caps = dst_dev(dst)->features; + rcu_read_lock(); + dev = dst_dev_rcu(dst); + sk->sk_route_caps = dev->features; if (sk_is_tcp(sk)) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -2625,13 +2628,14 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); + sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ - max_segs = max_t(u32, READ_ONCE(dst_dev(dst)->gso_max_segs), 1); + max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; sk_dst_set(sk, dst); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(sk_setup_caps); -- cgit v1.2.3 From edd3cb05c00a040dc72bed20b14b5ba865188bce Mon Sep 17 00:00:00 2001 From: Simon Schuster Date: Mon, 1 Sep 2025 15:09:51 +0200 Subject: copy_process: pass clone_flags as u64 across calltree With the introduction of clone3 in commit 7f192e3cd316 ("fork: add clone3") the effective bit width of clone_flags on all architectures was increased from 32-bit to 64-bit, with a new type of u64 for the flags. However, for most consumers of clone_flags the interface was not changed from the previous type of unsigned long. While this works fine as long as none of the new 64-bit flag bits (CLONE_CLEAR_SIGHAND and CLONE_INTO_CGROUP) are evaluated, this is still undesirable in terms of the principle of least surprise. Thus, this commit fixes all relevant interfaces of callees to sys_clone3/copy_process (excluding the architecture-specific copy_thread) to consistently pass clone_flags as u64, so that no truncation to 32-bit integers occurs on 32-bit architectures. Signed-off-by: Simon Schuster Link: https://lore.kernel.org/20250901-nios2-implement-clone3-v2-2-53fcf5577d57@siemens-energy.com Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Signed-off-by: Christian Brauner --- block/blk-ioc.c | 2 +- fs/namespace.c | 2 +- include/linux/cgroup.h | 4 ++-- include/linux/cred.h | 2 +- include/linux/iocontext.h | 6 +++--- include/linux/ipc_namespace.h | 4 ++-- include/linux/lsm_hook_defs.h | 2 +- include/linux/mnt_namespace.h | 2 +- include/linux/nsproxy.h | 2 +- include/linux/pid_namespace.h | 4 ++-- include/linux/rseq.h | 4 ++-- include/linux/sched/task.h | 2 +- include/linux/security.h | 4 ++-- include/linux/sem.h | 4 ++-- include/linux/time_namespace.h | 4 ++-- include/linux/uprobes.h | 4 ++-- include/linux/user_events.h | 4 ++-- include/linux/utsname.h | 4 ++-- include/net/net_namespace.h | 4 ++-- include/trace/events/task.h | 6 +++--- ipc/namespace.c | 2 +- ipc/sem.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/cred.c | 2 +- kernel/events/uprobes.c | 2 +- kernel/fork.c | 8 ++++---- kernel/nsproxy.c | 4 ++-- kernel/pid_namespace.c | 2 +- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 4 ++-- kernel/time/namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 2 +- security/apparmor/lsm.c | 2 +- security/security.c | 2 +- security/selinux/hooks.c | 2 +- security/tomoyo/tomoyo.c | 2 +- 38 files changed, 59 insertions(+), 59 deletions(-) (limited to 'include/net') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9fda3906e5f5..d15918d7fabb 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -286,7 +286,7 @@ out: } EXPORT_SYMBOL_GPL(set_task_ioprio); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d338..d9c190ffa7df 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4200,7 +4200,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } __latent_entropy -struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, +struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..56d9556a181a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -796,7 +796,7 @@ extern struct cgroup_namespace init_cgroup_ns; void free_cgroup_ns(struct cgroup_namespace *ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns); @@ -818,7 +818,7 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns) static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } static inline struct cgroup_namespace * -copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, +copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { return old_ns; diff --git a/include/linux/cred.h b/include/linux/cred.h index a102a10f833f..89ae50ad2ace 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -148,7 +148,7 @@ struct cred { extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); -extern int copy_creds(struct task_struct *, unsigned long); +extern int copy_creds(struct task_struct *, u64); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 14f7eaf1b443..079d8773790c 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -118,8 +118,8 @@ struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk); -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk); +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { if (!current->io_context) return 0; @@ -129,7 +129,7 @@ static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) struct io_context; static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e8240cf2611a..4b399893e2b3 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,7 +129,7 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) -extern struct ipc_namespace *copy_ipcs(unsigned long flags, +extern struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns); static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) @@ -151,7 +151,7 @@ static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns extern void put_ipc_ns(struct ipc_namespace *ns); #else -static inline struct ipc_namespace *copy_ipcs(unsigned long flags, +static inline struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (flags & CLONE_NEWIPC) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index fd11fffdd3c3..adbe234a6f6c 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -211,7 +211,7 @@ LSM_HOOK(int, 0, file_open, struct file *file) LSM_HOOK(int, 0, file_post_open, struct file *file, int mask) LSM_HOOK(int, 0, file_truncate, struct file *file) LSM_HOOK(int, 0, task_alloc, struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task) LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp) LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred) diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 70b366b64816..ff290c87b2e7 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -11,7 +11,7 @@ struct fs_struct; struct user_namespace; struct ns_common; -extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, +extern struct mnt_namespace *copy_mnt_ns(u64, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T)) diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index dab6a1734a22..82533e899ff4 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -103,7 +103,7 @@ static inline struct cred *nsset_cred(struct nsset *set) * */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk); +int copy_namespaces(u64 flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a5811199..0620a3e08e83 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -78,7 +78,7 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) } #endif -extern struct pid_namespace *copy_pid_ns(unsigned long flags, +extern struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); @@ -97,7 +97,7 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) return 0; } -static inline struct pid_namespace *copy_pid_ns(unsigned long flags, +static inline struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index bc8af3eb5598..a96fd345aa38 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -65,7 +65,7 @@ static inline void rseq_migrate(struct task_struct *t) * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; @@ -107,7 +107,7 @@ static inline void rseq_preempt(struct task_struct *t) static inline void rseq_migrate(struct task_struct *t) { } -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ea41795a352b..34d6a0e108c3 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -63,7 +63,7 @@ extern int lockdep_tasklist_lock_is_held(void); extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); -extern int sched_fork(unsigned long clone_flags, struct task_struct *p); +extern int sched_fork(u64 clone_flags, struct task_struct *p); extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); diff --git a/include/linux/security.h b/include/linux/security.h index 521bcb5b9717..9a1d4a6c8673 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -489,7 +489,7 @@ int security_file_receive(struct file *file); int security_file_open(struct file *file); int security_file_post_open(struct file *file, int mask); int security_file_truncate(struct file *file); -int security_task_alloc(struct task_struct *task, unsigned long clone_flags); +int security_task_alloc(struct task_struct *task, u64 clone_flags); void security_task_free(struct task_struct *task); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void security_cred_free(struct cred *cred); @@ -1215,7 +1215,7 @@ static inline int security_file_truncate(struct file *file) } static inline int security_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { return 0; } diff --git a/include/linux/sem.h b/include/linux/sem.h index c4deefe42aeb..275269ce2ec8 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -9,12 +9,12 @@ struct task_struct; #ifdef CONFIG_SYSVIPC -extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); +extern int copy_semundo(u64 clone_flags, struct task_struct *tsk); extern void exit_sem(struct task_struct *tsk); #else -static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_semundo(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index bb2c52f4fc94..b6e36525e0be 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -43,7 +43,7 @@ static inline struct time_namespace *get_time_ns(struct time_namespace *ns) return ns; } -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns); void free_time_ns(struct time_namespace *ns); @@ -129,7 +129,7 @@ static inline void put_time_ns(struct time_namespace *ns) } static inline -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 516217c39094..915303a82d84 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -205,7 +205,7 @@ extern void uprobe_start_dup_mmap(void); extern void uprobe_end_dup_mmap(void); extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm); extern void uprobe_free_utask(struct task_struct *t); -extern void uprobe_copy_process(struct task_struct *t, unsigned long flags); +extern void uprobe_copy_process(struct task_struct *t, u64 flags); extern int uprobe_post_sstep_notifier(struct pt_regs *regs); extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); extern void uprobe_notify_resume(struct pt_regs *regs); @@ -281,7 +281,7 @@ static inline bool uprobe_deny_signal(void) static inline void uprobe_free_utask(struct task_struct *t) { } -static inline void uprobe_copy_process(struct task_struct *t, unsigned long flags) +static inline void uprobe_copy_process(struct task_struct *t, u64 flags) { } static inline void uprobe_clear_state(struct mm_struct *mm) diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 8afa8c3a0973..57d1ff006090 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -33,7 +33,7 @@ extern void user_event_mm_dup(struct task_struct *t, extern void user_event_mm_remove(struct task_struct *t); static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { struct user_event_mm *old_mm; @@ -68,7 +68,7 @@ static inline void user_events_exit(struct task_struct *t) } #else static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { } diff --git a/include/linux/utsname.h b/include/linux/utsname.h index bf7613ba412b..ba34ec0e2f95 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -35,7 +35,7 @@ static inline void get_uts_ns(struct uts_namespace *ns) refcount_inc(&ns->ns.count); } -extern struct uts_namespace *copy_utsname(unsigned long flags, +extern struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns); extern void free_uts_ns(struct uts_namespace *ns); @@ -55,7 +55,7 @@ static inline void put_uts_ns(struct uts_namespace *ns) { } -static inline struct uts_namespace *copy_utsname(unsigned long flags, +static inline struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { if (flags & CLONE_NEWUTS) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 025a7574b275..0e008cfe159d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -204,7 +204,7 @@ struct net { extern struct net init_net; #ifdef CONFIG_NET_NS -struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net); void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); @@ -218,7 +218,7 @@ extern struct task_struct *cleanup_net_task; #else /* CONFIG_NET_NS */ #include #include -static inline struct net *copy_net_ns(unsigned long flags, +static inline struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { if (flags & CLONE_NEWNET) diff --git a/include/trace/events/task.h b/include/trace/events/task.h index af535b053033..4f0759634306 100644 --- a/include/trace/events/task.h +++ b/include/trace/events/task.h @@ -8,14 +8,14 @@ TRACE_EVENT(task_newtask, - TP_PROTO(struct task_struct *task, unsigned long clone_flags), + TP_PROTO(struct task_struct *task, u64 clone_flags), TP_ARGS(task, clone_flags), TP_STRUCT__entry( __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN) - __field( unsigned long, clone_flags) + __field( u64, clone_flags) __field( short, oom_score_adj) ), @@ -26,7 +26,7 @@ TRACE_EVENT(task_newtask, __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd", + TP_printk("pid=%d comm=%s clone_flags=%llx oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->clone_flags, __entry->oom_score_adj) ); diff --git a/ipc/namespace.c b/ipc/namespace.c index 4df91ceeeafe..a712ec27209c 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -106,7 +106,7 @@ fail: return ERR_PTR(err); } -struct ipc_namespace *copy_ipcs(unsigned long flags, +struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (!(flags & CLONE_NEWIPC)) diff --git a/ipc/sem.c b/ipc/sem.c index a39cdc7bf88f..0f06e4bd4673 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2303,7 +2303,7 @@ SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops, * parent and child tasks. */ -int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +int copy_semundo(u64 clone_flags, struct task_struct *tsk) { struct sem_undo_list *undo_list; int error; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 144a464e45c6..dedadb525880 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -47,7 +47,7 @@ void free_cgroup_ns(struct cgroup_namespace *ns) } EXPORT_SYMBOL(free_cgroup_ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { diff --git a/kernel/cred.c b/kernel/cred.c index 9676965c0981..dbf6b687dc5c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -287,7 +287,7 @@ struct cred *prepare_exec_creds(void) * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ -int copy_creds(struct task_struct *p, unsigned long clone_flags) +int copy_creds(struct task_struct *p, u64 clone_flags) { struct cred *new; int ret; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd..b2753014c6dd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2160,7 +2160,7 @@ static void dup_xol_work(struct callback_head *work) /* * Called in context of a new clone/fork from copy_process. */ -void uprobe_copy_process(struct task_struct *t, unsigned long flags) +void uprobe_copy_process(struct task_struct *t, u64 flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; diff --git a/kernel/fork.c b/kernel/fork.c index 4e2c5a3e8989..d6e1fb11eff9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1507,7 +1507,7 @@ fail_nomem: return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) +static int copy_mm(u64 clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; @@ -1545,7 +1545,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) +static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { @@ -1566,7 +1566,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk, +static int copy_files(u64 clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; @@ -1645,7 +1645,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) posix_cputimers_group_init(pct, cpu_limit); } -static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) +static int copy_signal(u64 clone_flags, struct task_struct *tsk) { struct signal_struct *sig; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5f31fdff8a38..8af3b9ec3aa8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -64,7 +64,7 @@ static inline struct nsproxy *create_nsproxy(void) * Return the newly created nsproxy. Do not attach this to the task, * leave it to the caller to do proper locking and attach it to task. */ -static struct nsproxy *create_new_namespaces(unsigned long flags, +static struct nsproxy *create_new_namespaces(u64 flags, struct task_struct *tsk, struct user_namespace *user_ns, struct fs_struct *new_fs) { @@ -144,7 +144,7 @@ out_ns: * called from clone. This now handles copy for nsproxy and all * namespaces therein. */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk) +int copy_namespaces(u64 flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717..06bc7c7f78e0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -171,7 +171,7 @@ static void destroy_pid_namespace_work(struct work_struct *work) } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); } -struct pid_namespace *copy_pid_ns(unsigned long flags, +struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4..6fa85d30d965 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4472,7 +4472,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * __sched_fork() is basic setup which is also used by sched_init() to * initialize the boot CPU's idle task. */ -static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +static void __sched_fork(u64 clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -4707,7 +4707,7 @@ late_initcall(sched_core_sysctl_init); /* * fork()/clone()-time setup: */ -int sched_fork(unsigned long clone_flags, struct task_struct *p) +int sched_fork(u64 clone_flags, struct task_struct *p) { __sched_fork(clone_flags, p); /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..af0866ce2dfc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3542,7 +3542,7 @@ out: } } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +void init_numa_balancing(u64 clone_flags, struct task_struct *p) { int mm_users = 0; struct mm_struct *mm = p->mm; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be9745d104f7..f9adfc912ddc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1935,12 +1935,12 @@ extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *p, struct task_struct *t, int cpu, int scpu); -extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +extern void init_numa_balancing(u64 clone_flags, struct task_struct *p); #else /* !CONFIG_NUMA_BALANCING: */ static inline void -init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +init_numa_balancing(u64 clone_flags, struct task_struct *p) { } diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 667452768ed3..888872bcc5bb 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -130,7 +130,7 @@ fail: * * Return: timens_for_children namespace or ERR_PTR. */ -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) diff --git a/kernel/utsname.c b/kernel/utsname.c index b1ac3ca870f2..00d8d7922f86 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -86,7 +86,7 @@ fail: * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(unsigned long flags, +struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *new_ns; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1b6f3826dd0e..8ec9d83475bf 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -539,7 +539,7 @@ void net_drop_ns(void *p) net_passive_dec(net); } -struct net *copy_net_ns(unsigned long flags, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { struct ucounts *ucounts; diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8e1cc229b41b..ba39cfe0cd08 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -112,7 +112,7 @@ static void apparmor_task_free(struct task_struct *task) } static int apparmor_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { struct aa_task_ctx *new = task_ctx(task); diff --git a/security/security.c b/security/security.c index ad163f06bf7a..a769140553bc 100644 --- a/security/security.c +++ b/security/security.c @@ -3185,7 +3185,7 @@ int security_file_truncate(struct file *file) * * Return: Returns a zero on success, negative values on failure. */ -int security_task_alloc(struct task_struct *task, unsigned long clone_flags) +int security_task_alloc(struct task_struct *task, u64 clone_flags) { int rc = lsm_task_alloc(task); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d..bb016dd511c1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4144,7 +4144,7 @@ static int selinux_file_open(struct file *file) /* task security operations */ static int selinux_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { u32 sid = current_sid(); diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index d6ebcd9db80a..48fc59d38ab2 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -514,7 +514,7 @@ struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = { * Returns 0. */ static int tomoyo_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { struct tomoyo_task *old = tomoyo_task(current); struct tomoyo_task *new = tomoyo_task(task); -- cgit v1.2.3 From 7051b54fb5aa2d0b77657aef7c272471b36c0327 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 29 Aug 2025 21:56:38 +0000 Subject: tcp: Remove sk->sk_prot->orphan_count. TCP tracks the number of orphaned (SOCK_DEAD but not yet destructed) sockets in tcp_orphan_count. In some code that was shared with DCCP, tcp_orphan_count is referenced via sk->sk_prot->orphan_count. Let's reference tcp_orphan_count directly. inet_csk_prepare_for_destroy_sock() is moved to inet_connection_sock.c due to header dependency. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250829215641.711664-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 4 ++-- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h | 1 - include/net/inet_connection_sock.h | 8 +------- include/net/sock.h | 2 -- include/net/tcp.h | 10 ++++++++++ net/ipv4/inet_connection_sock.c | 11 +++++++++-- net/ipv4/inet_hashtables.c | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_ipv4.c | 1 - net/ipv6/tcp_ipv6.c | 1 - 10 files changed, 24 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 000116e47e38..4ee970f3bad6 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -505,7 +505,7 @@ static void reset_listen_child(struct sock *child) chtls_send_reset(child, CPL_ABORT_SEND_RST, skb); sock_orphan(child); - INC_ORPHAN_COUNT(child); + tcp_orphan_count_inc(); if (child->sk_state == TCP_CLOSE) inet_csk_destroy_sock(child); } @@ -870,7 +870,7 @@ static void do_abort_syn_rcv(struct sock *child, struct sock *parent) * created only after 3 way handshake is done. */ sock_orphan(child); - INC_ORPHAN_COUNT(child); + tcp_orphan_count_inc(); chtls_release_resources(child); chtls_conn_done(child); } else { diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h index 667effc2a23c..29ceff5a5fcb 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h @@ -95,7 +95,6 @@ struct deferred_skb_cb { #define WSCALE_OK(tp) ((tp)->rx_opt.wscale_ok) #define TSTAMP_OK(tp) ((tp)->rx_opt.tstamp_ok) #define SACK_OK(tp) ((tp)->rx_opt.sack_ok) -#define INC_ORPHAN_COUNT(sk) this_cpu_inc(*(sk)->sk_prot->orphan_count) /* TLS SKB */ #define skb_ulp_tls_inline(skb) (ULP_SKB_CB(skb)->ulp.tls.ofld) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 1735db332aab..0737d8e178dd 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -299,14 +299,8 @@ reqsk_timeout(struct request_sock *req, unsigned long max_timeout) return (unsigned long)min_t(u64, timeout, max_timeout); } -static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) -{ - /* The below has to be done to allow calling inet_csk_destroy_sock */ - sock_set_flag(sk, SOCK_DEAD); - this_cpu_inc(*sk->sk_prot->orphan_count); -} - void inet_csk_destroy_sock(struct sock *sk); +void inet_csk_prepare_for_destroy_sock(struct sock *sk); void inet_csk_prepare_forced_close(struct sock *sk); /* diff --git a/include/net/sock.h b/include/net/sock.h index 73cd3316e288..1e7f124871d2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1353,8 +1353,6 @@ struct proto { unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ - unsigned int __percpu *orphan_count; - struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; diff --git a/include/net/tcp.h b/include/net/tcp.h index 16dc9cebb9d2..0fb7923b8367 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -54,6 +54,16 @@ extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int tcp_orphan_count_sum(void); +static inline void tcp_orphan_count_inc(void) +{ + this_cpu_inc(tcp_orphan_count); +} + +static inline void tcp_orphan_count_dec(void) +{ + this_cpu_dec(tcp_orphan_count); +} + DECLARE_PER_CPU(u32, tcp_tw_isn); void tcp_time_wait(struct sock *sk, int state, int timeo); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0ef1eacd539d..142ff8d86fc2 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1296,12 +1296,19 @@ void inet_csk_destroy_sock(struct sock *sk) xfrm_sk_free_policy(sk); - this_cpu_dec(*sk->sk_prot->orphan_count); + tcp_orphan_count_dec(); sock_put(sk); } EXPORT_SYMBOL(inet_csk_destroy_sock); +void inet_csk_prepare_for_destroy_sock(struct sock *sk) +{ + /* The below has to be done to allow calling inet_csk_destroy_sock */ + sock_set_flag(sk, SOCK_DEAD); + tcp_orphan_count_inc(); +} + /* This function allows to force a closure of a socket after the call to * tcp_create_openreq_child(). */ @@ -1369,7 +1376,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, sock_orphan(child); - this_cpu_inc(*sk->sk_prot->orphan_count); + tcp_orphan_count_inc(); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4bc2b1921d2b..ef4ccfd46ff6 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -707,7 +707,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { - this_cpu_inc(*sk->sk_prot->orphan_count); + tcp_orphan_count_inc(); inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9bc8317e92b7..40b774b4f587 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3195,7 +3195,7 @@ adjudge_to_death: /* remove backlog if any, without releasing ownership. */ __release_sock(sk); - this_cpu_inc(tcp_orphan_count); + tcp_orphan_count_inc(); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7c1d612afca1..1e58a8a9ff7a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3517,7 +3517,6 @@ struct proto tcp_prot = { .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, - .orphan_count = &tcp_orphan_count, .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b4e56b877273..07ba32156770 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2353,7 +2353,6 @@ struct proto tcpv6_prot = { .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, - .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), -- cgit v1.2.3 From 10343e7e6c7c6558217b56fb44a538ad04752adb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Aug 2025 15:30:52 +0000 Subject: inet: ping: remove ping_hash() There is no point in keeping ping_hash(). Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Yue Haibing Link: https://patch.msgid.link/20250829153054.474201-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ping.h | 1 - net/ipv4/ping.c | 10 ---------- net/ipv6/ping.c | 1 - 3 files changed, 12 deletions(-) (limited to 'include/net') diff --git a/include/net/ping.h b/include/net/ping.h index bc7779262e60..9634b8800814 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -54,7 +54,6 @@ struct pingfakehdr { }; int ping_get_port(struct sock *sk, unsigned short ident); -int ping_hash(struct sock *sk); void ping_unhash(struct sock *sk); int ping_init_sock(struct sock *sk); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 74a0beddfcc4..75e1b0f5c697 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -67,7 +67,6 @@ static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) pr_debug("hash(%u) = %u\n", num, res); return res; } -EXPORT_SYMBOL_GPL(ping_hash); static inline struct hlist_head *ping_hashslot(struct ping_table *table, struct net *net, unsigned int num) @@ -144,14 +143,6 @@ fail: } EXPORT_SYMBOL_GPL(ping_get_port); -int ping_hash(struct sock *sk) -{ - pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); - BUG(); /* "Please do not press this button again." */ - - return 0; -} - void ping_unhash(struct sock *sk) { struct inet_sock *isk = inet_sk(sk); @@ -1008,7 +999,6 @@ struct proto ping_prot = { .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .release_cb = ip4_datagram_release_cb, - .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 82b0492923d4..d7a2cdaa2631 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -208,7 +208,6 @@ struct proto pingv6_prot = { .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, - .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, -- cgit v1.2.3 From 689adb36bd433b24390080606a07d664cca2982e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Aug 2025 15:30:53 +0000 Subject: inet: ping: make ping_port_rover per netns Provide isolation between netns for ping idents. Randomize initial ping_port_rover value at netns creation. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250829153054.474201-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 1 + net/ipv4/ping.c | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 6373e3f17da8..54a7d187f62a 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -251,6 +251,7 @@ struct netns_ipv4 { int sysctl_igmp_qrv; struct ping_group_range ping_group_range; + u16 ping_port_rover; atomic_t dev_addr_genid; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 75e1b0f5c697..98ccd4f9ed65 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -58,8 +58,6 @@ static struct ping_table ping_table; struct pingv6_ops pingv6_ops; EXPORT_SYMBOL_GPL(pingv6_ops); -static u16 ping_port_rover; - static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) { u32 res = (num + net_hash_mix(net)) & mask; @@ -84,12 +82,12 @@ int ping_get_port(struct sock *sk, unsigned short ident) isk = inet_sk(sk); spin_lock(&ping_table.lock); if (ident == 0) { + u16 result = net->ipv4.ping_port_rover + 1; u32 i; - u16 result = ping_port_rover + 1; for (i = 0; i < (1L << 16); i++, result++) { if (!result) - result++; /* avoid zero */ + continue; /* avoid zero */ hlist = ping_hashslot(&ping_table, net, result); sk_for_each(sk2, hlist) { if (!net_eq(sock_net(sk2), net)) @@ -101,7 +99,7 @@ int ping_get_port(struct sock *sk, unsigned short ident) } /* found */ - ping_port_rover = ident = result; + net->ipv4.ping_port_rover = ident = result; break; next_port: ; @@ -1146,6 +1144,8 @@ static int __net_init ping_v4_proc_init_net(struct net *net) if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops, sizeof(struct ping_iter_state))) return -ENOMEM; + + net->ipv4.ping_port_rover = get_random_u16(); return 0; } -- cgit v1.2.3 From a60a5abe19d6acd9d9ea4c1883745399fb5dc023 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 Aug 2025 10:15:37 +0200 Subject: netfilter: nf_tables: allow iter callbacks to sleep Quoting Sven Auhagen: we do see on occasions that we get the following error message, more so on x86 systems than on arm64: Error: Could not process rule: Cannot allocate memory delete table inet filter It is not a consistent error and does not happen all the time. We are on Kernel 6.6.80, seems to me like we have something along the lines of the nf_tables: allow clone callbacks to sleep problem using GFP_ATOMIC. As hinted at by Sven, this is because of GFP_ATOMIC allocations during set flush. When set is flushed, all elements are deactivated. This triggers a set walk and each element gets added to the transaction list. The rbtree and rhashtable sets don't allow the iter callback to sleep: rbtree walk acquires read side of an rwlock with bh disabled, rhashtable walk happens with rcu read lock held. Rbtree is simple enough to resolve: When the walk context is ITER_READ, no change is needed (the iter callback must not deactivate elements; we're not in a transaction). When the iter type is ITER_UPDATE, the rwlock isn't needed because the caller holds the transaction mutex, this prevents any and all changes to the ruleset, including add/remove of set elements. Rhashtable is slightly more complex. When the iter type is ITER_READ, no change is needed, like rbtree. For ITER_UPDATE, we hold transaction mutex which prevents elements from getting free'd, even outside of rcu read lock section. So build a temporary list of all elements while doing the rcu iteration and then call the iterator in a second pass. The disadvantage is the need to iterate twice, but this cost comes with the benefit to allow the iter callback to use GFP_KERNEL allocations in a followup patch. The new list based logic makes it necessary to catch recursive calls to the same set earlier. Such walk -> iter -> walk recursion for the same set can happen during ruleset validation in case userspace gave us a bogus (cyclic) ruleset where verdict map m jumps to chain that sooner or later also calls "vmap @m". Before the new ->in_update_walk test, the ruleset is rejected because the infinite recursion causes ctx->level to exceed the allowed maximum. But with the new logic added here, elements would get skipped: nft_rhash_walk_update would see elements that are on the walk_list of an older stack frame. As all recursive calls into same map results in -EMLINK, we can avoid this problem by using the new in_update_walk flag and reject immediately. Next patch converts the problematic GFP_ATOMIC allocations. Reported-by: Sven Auhagen Closes: https://lore.kernel.org/netfilter-devel/BY1PR18MB5874110CAFF1ED098D0BC4E7E07BA@BY1PR18MB5874.namprd18.prod.outlook.com/ Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 2 + net/netfilter/nft_set_hash.c | 100 ++++++++++++++++++++++++++++++++++++-- net/netfilter/nft_set_rbtree.c | 35 ++++++++++--- 3 files changed, 126 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 891e43a01bdc..e2128663b160 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -556,6 +556,7 @@ struct nft_set_elem_expr { * @size: maximum set size * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element + * @in_update_walk: true during ->walk() in transaction phase * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal @@ -590,6 +591,7 @@ struct nft_set { u32 size; u8 field_len[NFT_REG32_COUNT]; u8 field_count; + bool in_update_walk; u32 use; atomic_t nelems; u32 ndeact; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 266d0c637225..ba01ce75d6de 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -30,6 +30,7 @@ struct nft_rhash { struct nft_rhash_elem { struct nft_elem_priv priv; struct rhash_head node; + struct llist_node walk_node; u32 wq_gc_seq; struct nft_set_ext ext; }; @@ -144,6 +145,7 @@ nft_rhash_update(struct nft_set *set, const u32 *key, goto err1; he = nft_elem_priv_cast(elem_priv); + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -180,6 +182,7 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set, }; struct nft_rhash_elem *prev; + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -261,12 +264,12 @@ static bool nft_rhash_delete(const struct nft_set *set, return true; } -static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rhash_walk_ro(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he; struct rhashtable_iter hti; + struct nft_rhash_elem *he; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); @@ -295,6 +298,97 @@ cont: rhashtable_walk_exit(&hti); } +static void nft_rhash_walk_update(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he, *tmp; + struct llist_node *first_node; + struct rhashtable_iter hti; + LLIST_HEAD(walk_list); + + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + + if (set->in_update_walk) { + /* This can happen with bogus rulesets during ruleset validation + * when a verdict map causes a jump back to the same map. + * + * Without this extra check the walk_next loop below will see + * elems on the callers walk_list and skip (not validate) them. + */ + iter->err = -EMLINK; + return; + } + + /* walk happens under RCU. + * + * We create a snapshot list so ->iter callback can sleep. + * commit_mutex is held, elements can ... + * .. be added in parallel from dataplane (dynset) + * .. be marked as dead in parallel from dataplane (dynset). + * .. be queued for removal in parallel (gc timeout). + * .. not be freed: transaction mutex is held. + */ + rhashtable_walk_enter(&priv->ht, &hti); + rhashtable_walk_start(&hti); + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { + if (PTR_ERR(he) != -EAGAIN) { + iter->err = PTR_ERR(he); + break; + } + + continue; + } + + /* rhashtable resized during walk, skip */ + if (llist_on_list(&he->walk_node)) + continue; + + llist_add(&he->walk_node, &walk_list); + } + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + + first_node = __llist_del_all(&walk_list); + set->in_update_walk = true; + llist_for_each_entry_safe(he, tmp, first_node, walk_node) { + if (iter->err == 0) { + iter->err = iter->fn(ctx, set, iter, &he->priv); + if (iter->err == 0) + iter->count++; + } + + /* all entries must be cleared again, else next ->walk iteration + * will skip entries. + */ + init_llist_node(&he->walk_node); + } + set->in_update_walk = false; +} + +static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + switch (iter->type) { + case NFT_ITER_UPDATE: + /* only relevant for netlink dumps which use READ type */ + WARN_ON_ONCE(iter->skip != 0); + + nft_rhash_walk_update(ctx, set, iter); + break; + case NFT_ITER_READ: + nft_rhash_walk_ro(ctx, set, iter); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } +} + static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, struct nft_set_ext *ext) { diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 938a257c069e..b311b66df3e9 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -584,15 +584,14 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, return NULL; } -static void nft_rbtree_walk(const struct nft_ctx *ctx, - struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rbtree_do_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct rb_node *node; - read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); @@ -600,14 +599,34 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, goto cont; iter->err = iter->fn(ctx, set, iter, &rbe->priv); - if (iter->err < 0) { - read_unlock_bh(&priv->lock); + if (iter->err < 0) return; - } cont: iter->count++; } - read_unlock_bh(&priv->lock); +} + +static void nft_rbtree_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + switch (iter->type) { + case NFT_ITER_UPDATE: + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + nft_rbtree_do_walk(ctx, set, iter); + break; + case NFT_ITER_READ: + read_lock_bh(&priv->lock); + nft_rbtree_do_walk(ctx, set, iter); + read_unlock_bh(&priv->lock); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } } static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, -- cgit v1.2.3 From f4f9e05904e11bbc772c031b35d0d25caa21d5e8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 13 Aug 2025 20:43:47 +0200 Subject: netfilter: nf_reject: remove unneeded exports These functions have no external callers and can be static. Signed-off-by: Florian Westphal --- include/net/netfilter/ipv4/nf_reject.h | 8 -------- include/net/netfilter/ipv6/nf_reject.h | 10 --------- net/ipv4/netfilter/nf_reject_ipv4.c | 27 ++++++++++++++++--------- net/ipv6/netfilter/nf_reject_ipv6.c | 37 +++++++++++++++++++++++----------- 4 files changed, 42 insertions(+), 40 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index c653fcb88354..09de2f2686b5 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -10,14 +10,6 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook); void nf_send_reset(struct net *net, struct sock *, struct sk_buff *oldskb, int hook); -const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *_oth, int hook); -struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int ttl); -void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, - const struct tcphdr *oth); - struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h index d729344ba644..94ec0b9f2838 100644 --- a/include/net/netfilter/ipv6/nf_reject.h +++ b/include/net/netfilter/ipv6/nf_reject.h @@ -9,16 +9,6 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, unsigned char cod unsigned int hooknum); void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook); -const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *otcph, - unsigned int *otcplen, int hook); -struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int hoplimit); -void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - const struct tcphdr *oth, unsigned int otcplen); - struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 0d3cb2ba6fc8..05631abe3f0d 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -12,6 +12,15 @@ #include #include +static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int ttl); +static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth); +static const struct tcphdr * +nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook); + static int nf_reject_iphdr_validate(struct sk_buff *skb) { struct iphdr *iph; @@ -136,8 +145,9 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach); -const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *_oth, int hook) +static const struct tcphdr * +nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook) { const struct tcphdr *oth; @@ -163,11 +173,10 @@ const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, return oth; } -EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get); -struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int ttl) +static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int ttl) { struct iphdr *niph, *oiph = ip_hdr(oldskb); @@ -188,10 +197,9 @@ struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, return niph; } -EXPORT_SYMBOL_GPL(nf_reject_iphdr_put); -void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, - const struct tcphdr *oth) +static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth) { struct iphdr *niph = ip_hdr(nskb); struct tcphdr *tcph; @@ -218,7 +226,6 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, nskb->csum_start = (unsigned char *)tcph - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); } -EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) { diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index cb2d38e80de9..6b022449f867 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -12,6 +12,19 @@ #include #include +static struct ipv6hdr * +nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int hoplimit); +static void +nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen); +static const struct tcphdr * +nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook); + static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -146,9 +159,10 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach); -const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *otcph, - unsigned int *otcplen, int hook) +static const struct tcphdr * +nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook) { const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); u8 proto; @@ -192,11 +206,11 @@ const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, return otcph; } -EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get); -struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int hoplimit) +static struct ipv6hdr * +nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int hoplimit) { struct ipv6hdr *ip6h; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); @@ -216,11 +230,11 @@ struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, return ip6h; } -EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put); -void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - const struct tcphdr *oth, unsigned int otcplen) +static void +nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen) { struct tcphdr *tcph; @@ -248,7 +262,6 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, csum_partial(tcph, sizeof(struct tcphdr), 0)); } -EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put); static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) { -- cgit v1.2.3 From 077dc4a275790b09e8a2ce80822ba8970e9dfb99 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Thu, 28 Aug 2025 14:48:31 +0200 Subject: netfilter: nft_payload: extend offset to 65535 bytes In some situations 255 bytes offset is not enough to match or manipulate the desired packet field. Increase the offset limit to 65535 or U16_MAX. In addition, the nla policy maximum value is not set anymore as it is limited to s16. Instead, the maximum value is checked during the payload expression initialization function. Tested with the nft command line tool. table ip filter { chain output { @nh,2040,8 set 0xff @nh,524280,8 set 0xff @nh,524280,8 0xff @nh,2040,8 0xff } } Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables_core.h | 2 +- net/netfilter/nft_payload.c | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 6c2f483d9828..7644cfe9267d 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -73,7 +73,7 @@ struct nft_ct { struct nft_payload { enum nft_payload_bases base:8; - u8 offset; + u16 offset; u8 len; u8 dreg; }; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 059b28ffad0e..b0214418f75a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -40,7 +40,7 @@ static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, /* add vlan header into the user buffer for if tag was removed by offloads */ static bool -nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) +nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len) { int mac_off = skb_mac_header(skb) - skb->data; u8 *vlanh, *dst_u8 = (u8 *) d; @@ -212,7 +212,7 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_PAYLOAD_OFFSET] = { .type = NLA_BE32 }, [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), @@ -797,7 +797,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, struct nft_payload_set { enum nft_payload_bases base:8; - u8 offset; + u16 offset; u8 len; u8 sreg; u8 csum_type; @@ -812,7 +812,7 @@ struct nft_payload_vlan_hdr { }; static bool -nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u8 offset, u8 len, +nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len, int *vlan_hlen) { struct nft_payload_vlan_hdr *vlanh; @@ -940,14 +940,18 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE; struct nft_payload_set *priv = nft_expr_priv(expr); - u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE; int err; priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); - priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); + if (err < 0) + return err; + priv->offset = offset; + if (tb[NFTA_PAYLOAD_CSUM_TYPE]) csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) { @@ -1069,7 +1073,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_PAYLOAD_DREG] == NULL) return ERR_PTR(-EINVAL); - err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); if (err < 0) return ERR_PTR(err); -- cgit v1.2.3 From 23a6037ce76cb44d93cfea23aec5c7f3971227d4 Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Fri, 29 Aug 2025 17:08:37 -0700 Subject: bonding: Remove support for use_carrier Remove the implementation of use_carrier, the link monitoring method that utilizes ethtool or ioctl to determine the link state of an interface in a bond. Bonding will always behaves as if use_carrier=1, which relies on netif_carrier_ok() to determine the link state of interfaces. To avoid acquiring RTNL many times per second, bonding inspects link state under RCU, but not under RTNL. However, ethtool implementations in drivers may sleep, and therefore this strategy is unsuitable for use with calls into driver ethtool functions. The use_carrier option was introduced in 2003, to provide backwards compatibility for network device drivers that did not support the then-new netif_carrier_ok/on/off system. Device drivers are now expected to support netif_carrier_*, and the use_carrier backwards compatibility logic is no longer necessary. The option itself remains, but when queried always returns 1, and may only be set to 1. Link: https://lore.kernel.org/000000000000eb54bf061cfd666a@google.com Link: https://lore.kernel.org/20240718122017.d2e33aaac43a.I10ab9c9ded97163aef4e4de10985cd8f7de60d28@changeid Signed-off-by: Jay Vosburgh Reported-by: syzbot+b8c48ea38ca27d150063@syzkaller.appspotmail.com Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/2029487.1756512517@famine Signed-off-by: Jakub Kicinski --- Documentation/networking/bonding.rst | 79 ++++-------------------- drivers/net/bonding/bond_main.c | 113 +++-------------------------------- drivers/net/bonding/bond_netlink.c | 14 ++--- drivers/net/bonding/bond_options.c | 7 +-- drivers/net/bonding/bond_sysfs.c | 6 +- include/net/bonding.h | 1 - 6 files changed, 28 insertions(+), 192 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index f8f5766703d4..a2b42ae719d2 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -582,10 +582,8 @@ miimon This determines how often the link state of each slave is inspected for link failures. A value of zero disables MII link monitoring. A value of 100 is a good starting point. - The use_carrier option, below, affects how the link state is - determined. See the High Availability section for additional - information. The default value is 100 if arp_interval is not - set. + + The default value is 100 if arp_interval is not set. min_links @@ -896,25 +894,14 @@ updelay use_carrier - Specifies whether or not miimon should use MII or ETHTOOL - ioctls vs. netif_carrier_ok() to determine the link - status. The MII or ETHTOOL ioctls are less efficient and - utilize a deprecated calling sequence within the kernel. The - netif_carrier_ok() relies on the device driver to maintain its - state with netif_carrier_on/off; at this writing, most, but - not all, device drivers support this facility. - - If bonding insists that the link is up when it should not be, - it may be that your network device driver does not support - netif_carrier_on/off. The default state for netif_carrier is - "carrier on," so if a driver does not support netif_carrier, - it will appear as if the link is always up. In this case, - setting use_carrier to 0 will cause bonding to revert to the - MII / ETHTOOL ioctl method to determine the link state. - - A value of 1 enables the use of netif_carrier_ok(), a value of - 0 will use the deprecated MII / ETHTOOL ioctls. The default - value is 1. + Obsolete option that previously selected between MII / + ETHTOOL ioctls and netif_carrier_ok() to determine link + state. + + All link state checks are now done with netif_carrier_ok(). + + For backwards compatibility, this option's value may be inspected + or set. The only valid setting is 1. xmit_hash_policy @@ -2036,22 +2023,8 @@ depending upon the device driver to maintain its carrier state, by querying the device's MII registers, or by making an ethtool query to the device. -If the use_carrier module parameter is 1 (the default value), -then the MII monitor will rely on the driver for carrier state -information (via the netif_carrier subsystem). As explained in the -use_carrier parameter information, above, if the MII monitor fails to -detect carrier loss on the device (e.g., when the cable is physically -disconnected), it may be that the driver does not support -netif_carrier. - -If use_carrier is 0, then the MII monitor will first query the -device's (via ioctl) MII registers and check the link state. If that -request fails (not just that it returns carrier down), then the MII -monitor will make an ethtool ETHTOOL_GLINK request to attempt to obtain -the same information. If both methods fail (i.e., the driver either -does not support or had some error in processing both the MII register -and ethtool requests), then the MII monitor will assume the link is -up. +The MII monitor relies on the driver for carrier state information (via +the netif_carrier subsystem). 8. Potential Sources of Trouble =============================== @@ -2135,34 +2108,6 @@ This will load tg3 and e1000 modules before loading the bonding one. Full documentation on this can be found in the modprobe.d and modprobe manual pages. -8.3. Painfully Slow Or No Failed Link Detection By Miimon ---------------------------------------------------------- - -By default, bonding enables the use_carrier option, which -instructs bonding to trust the driver to maintain carrier state. - -As discussed in the options section, above, some drivers do -not support the netif_carrier_on/_off link state tracking system. -With use_carrier enabled, bonding will always see these links as up, -regardless of their actual state. - -Additionally, other drivers do support netif_carrier, but do -not maintain it in real time, e.g., only polling the link state at -some fixed interval. In this case, miimon will detect failures, but -only after some long period of time has expired. If it appears that -miimon is very slow in detecting link failures, try specifying -use_carrier=0 to see if that improves the failure detection time. If -it does, then it may be that the driver checks the carrier state at a -fixed interval, but does not cache the MII register values (so the -use_carrier=0 method of querying the registers directly works). If -use_carrier=0 does not improve the failover, then the driver may cache -the registers, or the problem may be elsewhere. - -Also, remember that miimon only checks for the device's -carrier state. It has no way to determine the state of devices on or -beyond other ports of a switch, or if a switch is refusing to pass -traffic while still maintaining carrier on. - 9. SNMP agents =============== diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 257333c88710..f25c2d2c9181 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -142,8 +142,7 @@ module_param(downdelay, int, 0); MODULE_PARM_DESC(downdelay, "Delay before considering link down, " "in milliseconds"); module_param(use_carrier, int, 0); -MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; " - "0 for off, 1 for on (default)"); +MODULE_PARM_DESC(use_carrier, "option obsolete, use_carrier cannot be disabled"); module_param(mode, charp, 0); MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, " "1 for active-backup, 2 for balance-xor, " @@ -830,77 +829,6 @@ const char *bond_slave_link_status(s8 link) } } -/* if supports MII link status reporting, check its link status. - * - * We either do MII/ETHTOOL ioctls, or check netif_carrier_ok(), - * depending upon the setting of the use_carrier parameter. - * - * Return either BMSR_LSTATUS, meaning that the link is up (or we - * can't tell and just pretend it is), or 0, meaning that the link is - * down. - * - * If reporting is non-zero, instead of faking link up, return -1 if - * both ETHTOOL and MII ioctls fail (meaning the device does not - * support them). If use_carrier is set, return whatever it says. - * It'd be nice if there was a good way to tell if a driver supports - * netif_carrier, but there really isn't. - */ -static int bond_check_dev_link(struct bonding *bond, - struct net_device *slave_dev, int reporting) -{ - const struct net_device_ops *slave_ops = slave_dev->netdev_ops; - struct mii_ioctl_data *mii; - struct ifreq ifr; - int ret; - - if (!reporting && !netif_running(slave_dev)) - return 0; - - if (bond->params.use_carrier) - return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0; - - /* Try to get link status using Ethtool first. */ - if (slave_dev->ethtool_ops->get_link) { - netdev_lock_ops(slave_dev); - ret = slave_dev->ethtool_ops->get_link(slave_dev); - netdev_unlock_ops(slave_dev); - - return ret ? BMSR_LSTATUS : 0; - } - - /* Ethtool can't be used, fallback to MII ioctls. */ - if (slave_ops->ndo_eth_ioctl) { - /* TODO: set pointer to correct ioctl on a per team member - * bases to make this more efficient. that is, once - * we determine the correct ioctl, we will always - * call it and not the others for that team - * member. - */ - - /* We cannot assume that SIOCGMIIPHY will also read a - * register; not all network drivers (e.g., e100) - * support that. - */ - - /* Yes, the mii is overlaid on the ifreq.ifr_ifru */ - strscpy_pad(ifr.ifr_name, slave_dev->name, IFNAMSIZ); - mii = if_mii(&ifr); - - if (dev_eth_ioctl(slave_dev, &ifr, SIOCGMIIPHY) == 0) { - mii->reg_num = MII_BMSR; - if (dev_eth_ioctl(slave_dev, &ifr, SIOCGMIIREG) == 0) - return mii->val_out & BMSR_LSTATUS; - } - } - - /* If reporting, report that either there's no ndo_eth_ioctl, - * or both SIOCGMIIREG and get_link failed (meaning that we - * cannot report link status). If not reporting, pretend - * we're ok. - */ - return reporting ? -1 : BMSR_LSTATUS; -} - /*----------------------------- Multicast list ------------------------------*/ /* Push the promiscuity flag down to appropriate slaves */ @@ -1966,7 +1894,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, const struct net_device_ops *slave_ops = slave_dev->netdev_ops; struct slave *new_slave = NULL, *prev_slave; struct sockaddr_storage ss; - int link_reporting; int res = 0, i; if (slave_dev->flags & IFF_MASTER && @@ -1976,12 +1903,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, return -EPERM; } - if (!bond->params.use_carrier && - slave_dev->ethtool_ops->get_link == NULL && - slave_ops->ndo_eth_ioctl == NULL) { - slave_warn(bond_dev, slave_dev, "no link monitoring support\n"); - } - /* already in-use? */ if (netdev_is_rx_handler_busy(slave_dev)) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, @@ -2195,29 +2116,10 @@ skip_mac_set: new_slave->last_tx = new_slave->last_rx; - if (bond->params.miimon && !bond->params.use_carrier) { - link_reporting = bond_check_dev_link(bond, slave_dev, 1); - - if ((link_reporting == -1) && !bond->params.arp_interval) { - /* miimon is set but a bonded network driver - * does not support ETHTOOL/MII and - * arp_interval is not set. Note: if - * use_carrier is enabled, we will never go - * here (because netif_carrier is always - * supported); thus, we don't need to change - * the messages for netif_carrier. - */ - slave_warn(bond_dev, slave_dev, "MII and ETHTOOL support not available for slave, and arp_interval/arp_ip_target module parameters not specified, thus bonding will not detect link failures! see bonding.txt for details\n"); - } else if (link_reporting == -1) { - /* unable get link status using mii/ethtool */ - slave_warn(bond_dev, slave_dev, "can't get link status from slave; the network driver associated with this interface does not support MII or ETHTOOL link status reporting, thus miimon has no effect on this interface\n"); - } - } - /* check for initial state */ new_slave->link = BOND_LINK_NOCHANGE; if (bond->params.miimon) { - if (bond_check_dev_link(bond, slave_dev, 0) == BMSR_LSTATUS) { + if (netif_carrier_ok(slave_dev)) { if (bond->params.updelay) { bond_set_slave_link_state(new_slave, BOND_LINK_BACK, @@ -2759,7 +2661,7 @@ static int bond_miimon_inspect(struct bonding *bond) bond_for_each_slave_rcu(bond, slave, iter) { bond_propose_link_state(slave, BOND_LINK_NOCHANGE); - link_state = bond_check_dev_link(bond, slave->dev, 0); + link_state = netif_carrier_ok(slave->dev); switch (slave->link) { case BOND_LINK_UP: @@ -6257,10 +6159,10 @@ static int __init bond_check_params(struct bond_params *params) downdelay = 0; } - if ((use_carrier != 0) && (use_carrier != 1)) { - pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0/1), so it was set to 1\n", - use_carrier); - use_carrier = 1; + if (use_carrier != 1) { + pr_err("Error: invalid use_carrier parameter (%d)\n", + use_carrier); + return -EINVAL; } if (num_peer_notif < 0 || num_peer_notif > 255) { @@ -6507,7 +6409,6 @@ static int __init bond_check_params(struct bond_params *params) params->updelay = updelay; params->downdelay = downdelay; params->peer_notif_delay = 0; - params->use_carrier = use_carrier; params->lacp_active = 1; params->lacp_fast = lacp_fast; params->primary[0] = 0; diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 57fff2421f1b..e573b34a1bbc 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -259,13 +259,11 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[], return err; } if (data[IFLA_BOND_USE_CARRIER]) { - int use_carrier = nla_get_u8(data[IFLA_BOND_USE_CARRIER]); - - bond_opt_initval(&newval, use_carrier); - err = __bond_opt_set(bond, BOND_OPT_USE_CARRIER, &newval, - data[IFLA_BOND_USE_CARRIER], extack); - if (err) - return err; + if (nla_get_u8(data[IFLA_BOND_USE_CARRIER]) != 1) { + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_BOND_USE_CARRIER], + "option obsolete, use_carrier cannot be disabled"); + return -EINVAL; + } } if (data[IFLA_BOND_ARP_INTERVAL]) { int arp_interval = nla_get_u32(data[IFLA_BOND_ARP_INTERVAL]); @@ -688,7 +686,7 @@ static int bond_fill_info(struct sk_buff *skb, bond->params.peer_notif_delay * bond->params.miimon)) goto nla_put_failure; - if (nla_put_u8(skb, IFLA_BOND_USE_CARRIER, bond->params.use_carrier)) + if (nla_put_u8(skb, IFLA_BOND_USE_CARRIER, 1)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_ARP_INTERVAL, bond->params.arp_interval)) diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 3b6f815c55ff..c0a5eb8766b5 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -187,7 +187,6 @@ static const struct bond_opt_value bond_primary_reselect_tbl[] = { }; static const struct bond_opt_value bond_use_carrier_tbl[] = { - { "off", 0, 0}, { "on", 1, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0} }; @@ -419,7 +418,7 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { [BOND_OPT_USE_CARRIER] = { .id = BOND_OPT_USE_CARRIER, .name = "use_carrier", - .desc = "Use netif_carrier_ok (vs MII ioctls) in miimon", + .desc = "option obsolete, use_carrier cannot be disabled", .values = bond_use_carrier_tbl, .set = bond_option_use_carrier_set }, @@ -1091,10 +1090,6 @@ static int bond_option_peer_notif_delay_set(struct bonding *bond, static int bond_option_use_carrier_set(struct bonding *bond, const struct bond_opt_value *newval) { - netdev_dbg(bond->dev, "Setting use_carrier to %llu\n", - newval->value); - bond->params.use_carrier = newval->value; - return 0; } diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index 1e13bb170515..9a75ad3181ab 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -467,14 +467,12 @@ static ssize_t bonding_show_primary_reselect(struct device *d, static DEVICE_ATTR(primary_reselect, 0644, bonding_show_primary_reselect, bonding_sysfs_store_option); -/* Show the use_carrier flag. */ +/* use_carrier is obsolete, but print value for compatibility */ static ssize_t bonding_show_carrier(struct device *d, struct device_attribute *attr, char *buf) { - struct bonding *bond = to_bond(d); - - return sysfs_emit(buf, "%d\n", bond->params.use_carrier); + return sysfs_emit(buf, "1\n"); } static DEVICE_ATTR(use_carrier, 0644, bonding_show_carrier, bonding_sysfs_store_option); diff --git a/include/net/bonding.h b/include/net/bonding.h index e06f0d63b2c1..37335f62f579 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -126,7 +126,6 @@ struct bond_params { int arp_interval; int arp_validate; int arp_all_targets; - int use_carrier; int fail_over_mac; int updelay; int downdelay; -- cgit v1.2.3 From 5d14bbf9d1d90cb7ca3e46fe2c8a4277572eab94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Sep 2025 09:31:41 +0000 Subject: net_sched: act: remove tcfa_qstats tcfa_qstats is currently only used to hold drops and overlimits counters. tcf_action_inc_drop_qstats() and tcf_action_inc_overlimit_qstats() currently acquire a->tcfa_lock to increment these counters. Switch to two atomic_t to get lock-free accounting. Signed-off-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20250901093141.2093176-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/act_api.h | 14 ++++++-------- net/sched/act_api.c | 12 ++++++++---- 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 2894cfff2da3..91a24b5e0b93 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -33,7 +33,10 @@ struct tc_action { struct tcf_t tcfa_tm; struct gnet_stats_basic_sync tcfa_bstats; struct gnet_stats_basic_sync tcfa_bstats_hw; - struct gnet_stats_queue tcfa_qstats; + + atomic_t tcfa_drops; + atomic_t tcfa_overlimits; + struct net_rate_estimator __rcu *tcfa_rate_est; spinlock_t tcfa_lock; struct gnet_stats_basic_sync __percpu *cpu_bstats; @@ -53,7 +56,6 @@ struct tc_action { #define tcf_action common.tcfa_action #define tcf_tm common.tcfa_tm #define tcf_bstats common.tcfa_bstats -#define tcf_qstats common.tcfa_qstats #define tcf_rate_est common.tcfa_rate_est #define tcf_lock common.tcfa_lock @@ -241,9 +243,7 @@ static inline void tcf_action_inc_drop_qstats(struct tc_action *a) qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); return; } - spin_lock(&a->tcfa_lock); - qstats_drop_inc(&a->tcfa_qstats); - spin_unlock(&a->tcfa_lock); + atomic_inc(&a->tcfa_drops); } static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) @@ -252,9 +252,7 @@ static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats)); return; } - spin_lock(&a->tcfa_lock); - qstats_overlimit_inc(&a->tcfa_qstats); - spin_unlock(&a->tcfa_lock); + atomic_inc(&a->tcfa_overlimits); } void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9e468e463467..ff6be5cfe2b0 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1585,7 +1585,7 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, } _bstats_update(&a->tcfa_bstats, bytes, packets); - a->tcfa_qstats.drops += drops; + atomic_add(drops, &a->tcfa_drops); if (hw) _bstats_update(&a->tcfa_bstats_hw, bytes, packets); } @@ -1594,8 +1594,9 @@ EXPORT_SYMBOL(tcf_action_update_stats); int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { - int err = 0; + struct gnet_stats_queue qstats = {0}; struct gnet_dump d; + int err = 0; if (p == NULL) goto errout; @@ -1619,14 +1620,17 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, if (err < 0) goto errout; + qstats.drops = atomic_read(&p->tcfa_drops); + qstats.overlimits = atomic_read(&p->tcfa_overlimits); + if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfa_bstats, false) < 0 || gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, &p->tcfa_bstats_hw, false) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, - &p->tcfa_qstats, - p->tcfa_qstats.qlen) < 0) + &qstats, + qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) -- cgit v1.2.3 From 929324913e0caabea91b50fa71e41d70b766f7dc Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 2 Sep 2025 23:11:35 +0200 Subject: net: Add rfs_needed() helper Add a helper to check if RFS is needed or not. Allows to make the code a bit cleaner and the next patch to have MPTCP use this helper to decide whether or not to iterate over the subflows. tun_flow_update() was calling sock_rps_record_flow_hash() regardless of the state of rfs_needed. This was not really a bug as sock_flow_table simply ends up being NULL and thus everything will be fine. This commit here thus also implicitly makes tun_flow_update() respect the state of rfs_needed. Suggested-by: Matthieu Baerts Signed-off-by: Christoph Paasch Acked-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250902-net-next-mptcp-misc-feat-6-18-v2-3-fa02bb3188b1@kernel.org Signed-off-by: Jakub Kicinski --- include/net/rps.h | 85 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 29 deletions(-) (limited to 'include/net') diff --git a/include/net/rps.h b/include/net/rps.h index 9917dce42ca4..f1794cd2e7fb 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -85,11 +85,8 @@ static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, WRITE_ONCE(table->ents[index], val); } -#endif /* CONFIG_RPS */ - -static inline void sock_rps_record_flow_hash(__u32 hash) +static inline void _sock_rps_record_flow_hash(__u32 hash) { -#ifdef CONFIG_RPS struct rps_sock_flow_table *sock_flow_table; if (!hash) @@ -99,42 +96,33 @@ static inline void sock_rps_record_flow_hash(__u32 hash) if (sock_flow_table) rps_record_sock_flow(sock_flow_table, hash); rcu_read_unlock(); -#endif } -static inline void sock_rps_record_flow(const struct sock *sk) +static inline void _sock_rps_record_flow(const struct sock *sk) { -#ifdef CONFIG_RPS - if (static_branch_unlikely(&rfs_needed)) { - /* Reading sk->sk_rxhash might incur an expensive cache line - * miss. - * - * TCP_ESTABLISHED does cover almost all states where RFS - * might be useful, and is cheaper [1] than testing : - * IPv4: inet_sk(sk)->inet_daddr - * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) - * OR an additional socket flag - * [1] : sk_state and sk_prot are in the same cache line. + /* Reading sk->sk_rxhash might incur an expensive cache line + * miss. + * + * TCP_ESTABLISHED does cover almost all states where RFS + * might be useful, and is cheaper [1] than testing : + * IPv4: inet_sk(sk)->inet_daddr + * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) + * OR an additional socket flag + * [1] : sk_state and sk_prot are in the same cache line. + */ + if (sk->sk_state == TCP_ESTABLISHED) { + /* This READ_ONCE() is paired with the WRITE_ONCE() + * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). */ - if (sk->sk_state == TCP_ESTABLISHED) { - /* This READ_ONCE() is paired with the WRITE_ONCE() - * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). - */ - sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); - } + _sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); } -#endif } -static inline void sock_rps_delete_flow(const struct sock *sk) +static inline void _sock_rps_delete_flow(const struct sock *sk) { -#ifdef CONFIG_RPS struct rps_sock_flow_table *table; u32 hash, index; - if (!static_branch_unlikely(&rfs_needed)) - return; - hash = READ_ONCE(sk->sk_rxhash); if (!hash) return; @@ -147,6 +135,45 @@ static inline void sock_rps_delete_flow(const struct sock *sk) WRITE_ONCE(table->ents[index], RPS_NO_CPU); } rcu_read_unlock(); +} +#endif /* CONFIG_RPS */ + +static inline bool rfs_is_needed(void) +{ +#ifdef CONFIG_RPS + return static_branch_unlikely(&rfs_needed); +#else + return false; +#endif +} + +static inline void sock_rps_record_flow_hash(__u32 hash) +{ +#ifdef CONFIG_RPS + if (!rfs_is_needed()) + return; + + _sock_rps_record_flow_hash(hash); +#endif +} + +static inline void sock_rps_record_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + if (!rfs_is_needed()) + return; + + _sock_rps_record_flow(sk); +#endif +} + +static inline void sock_rps_delete_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + if (!rfs_is_needed()) + return; + + _sock_rps_delete_flow(sk); #endif } -- cgit v1.2.3 From 017bda80fd0ddd24ea8cf932c3bd970491e0abc1 Mon Sep 17 00:00:00 2001 From: Asbjørn Sloth Tønnesen Date: Tue, 2 Sep 2025 15:46:37 +0000 Subject: genetlink: fix typo in comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this context "not that ..." should properly be "note that ...". Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250902154640.759815-4-ast@fiberby.net Signed-off-by: Jakub Kicinski --- include/net/genetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index a03d56765832..7b84f2cef8b1 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -62,7 +62,7 @@ struct genl_info; * @small_ops: the small-struct operations supported by this family * @n_small_ops: number of small-struct operations supported by this family * @split_ops: the split do/dump form of operation definition - * @n_split_ops: number of entries in @split_ops, not that with split do/dump + * @n_split_ops: number of entries in @split_ops, note that with split do/dump * ops the number of entries is not the same as number of commands * @sock_priv_size: the size of per-socket private memory * @sock_priv_init: the per-socket private memory initializer -- cgit v1.2.3 From 5d6b58c932ec451a5c41482790eb5b1ecf165a94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Sep 2025 18:36:03 +0000 Subject: net: lockless sock_i_ino() Followup of commit c51da3f7a161 ("net: remove sock_i_uid()") A recent syzbot report was the trigger for this change. Over the years, we had many problems caused by the read_lock[_bh](&sk->sk_callback_lock) in sock_i_uid(). We could fix smc_diag_dump_proto() or make a more radical move: Instead of waiting for new syzbot reports, cache the socket inode number in sk->sk_ino, so that we no longer need to acquire sk->sk_callback_lock in sock_i_ino(). This makes socket dumps faster (one less cache line miss, and two atomic ops avoided). Prior art: commit 25a9c8a4431c ("netlink: Add __sock_i_ino() for __netlink_diag_dump().") commit 4f9bf2a2f5aa ("tcp: Don't acquire inet_listen_hashbucket::lock with disabled BH.") commit efc3dbc37412 ("rds: Make rds_sock_lock BH rather than IRQ safe.") Fixes: d2d6422f8bd1 ("x86: Allow to enable PREEMPT_RT.") Reported-by: syzbot+50603c05bbdf4dfdaffa@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68b73804.050a0220.3db4df.01d8.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250902183603.740428-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 17 +++++++++++++---- net/core/sock.c | 22 ---------------------- net/mptcp/protocol.c | 1 - net/netlink/diag.c | 2 +- 4 files changed, 14 insertions(+), 28 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index c8a4b283df6f..fb13322a11fc 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -285,6 +285,7 @@ struct sk_filter; * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner + * @sk_ino: inode number (zero if orphaned) * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting @@ -518,6 +519,7 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; + unsigned long sk_ino; spinlock_t sk_peer_lock; int sk_bind_phc; struct pid *sk_peer_pid; @@ -2056,6 +2058,10 @@ static inline int sk_rx_queue_get(const struct sock *sk) static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk->sk_socket = sock; + if (sock) { + WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid); + WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino); + } } static inline wait_queue_head_t *sk_sleep(struct sock *sk) @@ -2077,6 +2083,7 @@ static inline void sock_orphan(struct sock *sk) sk_set_socket(sk, NULL); sk->sk_wq = NULL; /* Note: sk_uid is unchanged. */ + WRITE_ONCE(sk->sk_ino, 0); write_unlock_bh(&sk->sk_callback_lock); } @@ -2087,20 +2094,22 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); - WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid); security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } +static inline unsigned long sock_i_ino(const struct sock *sk) +{ + /* Paired with WRITE_ONCE() in sock_graft() and sock_orphan() */ + return READ_ONCE(sk->sk_ino); +} + static inline kuid_t sk_uid(const struct sock *sk) { /* Paired with WRITE_ONCE() in sockfs_setattr() */ return READ_ONCE(sk->sk_uid); } -unsigned long __sock_i_ino(struct sock *sk); -unsigned long sock_i_ino(struct sock *sk); - static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) { return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0); diff --git a/net/core/sock.c b/net/core/sock.c index 7c26ec8dce63..158bddd23134 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2780,28 +2780,6 @@ void sock_pfree(struct sk_buff *skb) EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ -unsigned long __sock_i_ino(struct sock *sk) -{ - unsigned long ino; - - read_lock(&sk->sk_callback_lock); - ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; - read_unlock(&sk->sk_callback_lock); - return ino; -} -EXPORT_SYMBOL(__sock_i_ino); - -unsigned long sock_i_ino(struct sock *sk) -{ - unsigned long ino; - - local_bh_disable(); - ino = __sock_i_ino(sk); - local_bh_enable(); - return ino; -} -EXPORT_SYMBOL(sock_i_ino); - /* * Allocate a skb from the socket's send buffer. */ diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9a287b75c1b3..e6fd97b21e9e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3554,7 +3554,6 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent) write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); sk_set_socket(sk, parent); - WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid); write_unlock_bh(&sk->sk_callback_lock); } diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 61981e01fd6f..b8e58132e8af 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -168,7 +168,7 @@ mc_list: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - __sock_i_ino(sk)) < 0) { + sock_i_ino(sk)) < 0) { ret = 1; break; } -- cgit v1.2.3 From 21f82062d0f241e55dd59eb630e8710862cc90b4 Mon Sep 17 00:00:00 2001 From: Juraj Šarinay Date: Tue, 2 Sep 2025 13:36:28 +0200 Subject: net: nfc: nci: Increase NCI_DATA_TIMEOUT to 3000 ms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An exchange with a NFC target must complete within NCI_DATA_TIMEOUT. A delay of 700 ms is not sufficient for cryptographic operations on smart cards. CardOS 6.0 may need up to 1.3 seconds to perform 256-bit ECDH or 3072-bit RSA. To prevent brute-force attacks, passports and similar documents introduce even longer delays into access control protocols (BAC/PACE). The timeout should be higher, but not too much. The expiration allows us to detect that a NFC target has disappeared. Signed-off-by: Juraj Šarinay Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250902113630.62393-1-juraj@sarinay.com Signed-off-by: Jakub Kicinski --- include/net/nfc/nci_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index e180bdf2f82b..664d5058e66e 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -52,7 +52,7 @@ enum nci_state { #define NCI_RF_DISC_SELECT_TIMEOUT 5000 #define NCI_RF_DEACTIVATE_TIMEOUT 30000 #define NCI_CMD_TIMEOUT 5000 -#define NCI_DATA_TIMEOUT 700 +#define NCI_DATA_TIMEOUT 3000 struct nci_dev; -- cgit v1.2.3 From 3ceb08838b576b20108d7facf6baa3dbf792afe9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 1 Sep 2025 14:12:10 -0700 Subject: net: add helper to pre-check if PP for an Rx queue will be unreadable mlx5 pokes into the rxq state to check if the queue has a memory provider, and therefore whether it may produce unreadable mem. Add a helper for doing this in the page pool API. fbnic will want a similar thing (tho, for a slightly different reason). Reviewed-by: Mina Almasry Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250901211214.1027927-11-kuba@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 9 +-------- include/net/netdev_queues.h | 2 ++ include/net/page_pool/helpers.h | 12 ++++++++++++ net/core/netdev_rx_queue.c | 9 +++++++++ 4 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 3970d0ddbcdc..714cce595692 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -780,13 +780,6 @@ static void mlx5e_rq_shampo_hd_info_free(struct mlx5e_rq *rq) bitmap_free(rq->mpwqe.shampo->bitmap); } -static bool mlx5_rq_needs_separate_hd_pool(struct mlx5e_rq *rq) -{ - struct netdev_rx_queue *rxq = __netif_get_rx_queue(rq->netdev, rq->ix); - - return !!rxq->mp_params.mp_ops; -} - static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_rq_param *rqp, @@ -825,7 +818,7 @@ static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, hd_pool_size = (rq->mpwqe.shampo->hd_per_wqe * wq_size) / MLX5E_SHAMPO_WQ_HEADER_PER_PAGE; - if (mlx5_rq_needs_separate_hd_pool(rq)) { + if (netif_rxq_has_unreadable_mp(rq->netdev, rq->ix)) { /* Separate page pool for shampo headers */ struct page_pool_params pp_params = { }; diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index b9d02bc65c97..cd00e0406cf4 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -151,6 +151,8 @@ struct netdev_queue_mgmt_ops { int idx); }; +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); + /** * DOC: Lockless queue stopping / waking helpers. * diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index aa3719f28216..3247026e096a 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -505,6 +505,18 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) page_pool_update_nid(pool, new_nid); } +/** + * page_pool_is_unreadable() - will allocated buffers be unreadable for the CPU + * @pool: queried page pool + * + * Check if page pool will return buffers which are unreadable to the CPU / + * kernel. This will only be the case if user space bound a memory provider (mp) + * which returns unreadable memory to the queue served by the page pool. + * If %PP_FLAG_ALLOW_UNREADABLE_NETMEM was set but there is no mp bound + * this helper will return false. See also netif_rxq_has_unreadable_mp(). + * + * Return: true if memory allocated by the page pool may be unreadable + */ static inline bool page_pool_is_unreadable(struct page_pool *pool) { return !!pool->mp_ops; diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 3bf1151d8061..c7d9341b7630 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -9,6 +9,15 @@ #include "page_pool_priv.h" +/* See also page_pool_is_unreadable() */ +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) +{ + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); + + return !!rxq->mp_params.mp_ops; +} +EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); + int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) { struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); -- cgit v1.2.3 From 5f9d5fd8e08968e66d0212f782fc24d76e52800f Mon Sep 17 00:00:00 2001 From: Aditya Kumar Singh Date: Tue, 12 Aug 2025 12:53:28 +0530 Subject: wifi: cfg80211: fix return value in cfg80211_get_radio_idx_by_chan() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a valid radio index is not found, the function returns -ENOENT. If the channel argument itself is invalid, it returns -EINVAL. However, since the caller only checks for < 0, the distinction between these error codes is not utilized much. Also, handling these two distinct error codes throughout the codebase adds complexity, as both cases must be addressed separately. A subsequent change aims to simplify this by using a single error code for all invalid cases, making error handling more consistent and streamlined. To support this change, update the return value to -EINVAL when a valid radio index is not found. This is still appropriate because, even if the channel argument is structurally valid, the absence of a corresponding radio index implies that the argument is effectively invalid—otherwise, a valid index would have been found. Signed-off-by: Aditya Kumar Singh Link: https://patch.msgid.link/20250812-fix_scan_ap_flag_requirement_during_mlo-v4-1-383ffb6da213@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- net/wireless/util.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 406626ff6cc8..cb1c36be2749 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -9548,7 +9548,7 @@ int cfg80211_iter_combinations(struct wiphy *wiphy, * @wiphy: the wiphy * @chan: channel for which the supported radio index is required * - * Return: radio index on success or a negative error code + * Return: radio index on success or -EINVAL otherwise */ int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, const struct ieee80211_channel *chan); diff --git a/net/wireless/util.c b/net/wireless/util.c index 240c68baa3d1..d12d49134c88 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2584,7 +2584,7 @@ int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, } } - return -ENOENT; + return -EINVAL; } EXPORT_SYMBOL(cfg80211_get_radio_idx_by_chan); -- cgit v1.2.3 From d0bf06158c39e7129524dd8b43b82aed84d68faa Mon Sep 17 00:00:00 2001 From: Muna Sinada Date: Fri, 15 Aug 2025 14:30:11 -0700 Subject: wifi: nl80211: Add EHT fixed Tx rate support Add new attributes to support EHT MCS/NSS Tx rates and EHT GI/LTF. Parse EHT fixed MCS/NSS Tx rates and EHT GI/LTF values passed by the userspace, validate and add as part of cfg80211_bitrate_mask. MCS mask is constructed by new function, eht_build_mcs_mask(). Max NSS supported for MCS rates of 7, 9, 11 and 13 is utilized to set MCS bitmask for each NSS. MCS rates 14, and 15 if supported, are set only for NSS = 0. Co-developed-by: Aloka Dixit Signed-off-by: Aloka Dixit Signed-off-by: Muna Sinada Link: https://patch.msgid.link/20250815213011.2704803-1-muna.sinada@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 + include/uapi/linux/nl80211.h | 41 +++++++- net/wireless/nl80211.c | 229 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 266 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cb1c36be2749..7d881aa7e48b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -841,9 +841,12 @@ struct cfg80211_bitrate_mask { u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mcs[NL80211_VHT_NSS_MAX]; u16 he_mcs[NL80211_HE_NSS_MAX]; + u16 eht_mcs[NL80211_EHT_NSS_MAX]; enum nl80211_txrate_gi gi; enum nl80211_he_gi he_gi; + enum nl80211_eht_gi eht_gi; enum nl80211_he_ltf he_ltf; + enum nl80211_eht_ltf eht_ltf; } control[NUM_NL80211_BANDS]; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a4bc0c2729f6..4f08264bbc8e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1943,8 +1943,9 @@ enum nl80211_commands { * The driver must also specify support for this with the extended * features NL80211_EXT_FEATURE_BEACON_RATE_LEGACY, * NL80211_EXT_FEATURE_BEACON_RATE_HT, - * NL80211_EXT_FEATURE_BEACON_RATE_VHT and - * NL80211_EXT_FEATURE_BEACON_RATE_HE. + * NL80211_EXT_FEATURE_BEACON_RATE_VHT, + * NL80211_EXT_FEATURE_BEACON_RATE_HE and + * NL80211_EXT_FEATURE_BEACON_RATE_EHT. * * @NL80211_ATTR_FRAME_MATCH: A binary attribute which typically must contain * at least one byte, currently used with @NL80211_CMD_REGISTER_FRAME. @@ -3736,6 +3737,22 @@ enum nl80211_eht_gi { NL80211_RATE_INFO_EHT_GI_3_2, }; +/** + * enum nl80211_eht_ltf - EHT long training field + * @NL80211_RATE_INFO_EHT_1XLTF: 3.2 usec + * @NL80211_RATE_INFO_EHT_2XLTF: 6.4 usec + * @NL80211_RATE_INFO_EHT_4XLTF: 12.8 usec + * @NL80211_RATE_INFO_EHT_6XLTF: 19.2 usec + * @NL80211_RATE_INFO_EHT_8XLTF: 25.6 usec + */ +enum nl80211_eht_ltf { + NL80211_RATE_INFO_EHT_1XLTF, + NL80211_RATE_INFO_EHT_2XLTF, + NL80211_RATE_INFO_EHT_4XLTF, + NL80211_RATE_INFO_EHT_6XLTF, + NL80211_RATE_INFO_EHT_8XLTF, +}; + /** * enum nl80211_eht_ru_alloc - EHT RU allocation values * @NL80211_RATE_INFO_EHT_RU_ALLOC_26: 26-tone RU allocation @@ -5482,6 +5499,10 @@ enum nl80211_key_attributes { * see &struct nl80211_txrate_he * @NL80211_TXRATE_HE_GI: configure HE GI, 0.8us, 1.6us and 3.2us. * @NL80211_TXRATE_HE_LTF: configure HE LTF, 1XLTF, 2XLTF and 4XLTF. + * @NL80211_TXRATE_EHT: EHT rates allowed for TX rate selection, + * see &struct nl80211_txrate_eht + * @NL80211_TXRATE_EHT_GI: configure EHT GI, (u8, see &enum nl80211_eht_gi) + * @NL80211_TXRATE_EHT_LTF: configure EHT LTF, (u8, see &enum nl80211_eht_ltf) * @__NL80211_TXRATE_AFTER_LAST: internal * @NL80211_TXRATE_MAX: highest TX rate attribute */ @@ -5494,6 +5515,9 @@ enum nl80211_tx_rate_attributes { NL80211_TXRATE_HE, NL80211_TXRATE_HE_GI, NL80211_TXRATE_HE_LTF, + NL80211_TXRATE_EHT, + NL80211_TXRATE_EHT_GI, + NL80211_TXRATE_EHT_LTF, /* keep last */ __NL80211_TXRATE_AFTER_LAST, @@ -5526,6 +5550,15 @@ enum nl80211_txrate_gi { NL80211_TXRATE_FORCE_LGI, }; +#define NL80211_EHT_NSS_MAX 16 +/** + * struct nl80211_txrate_eht - EHT MCS/NSS txrate bitmap + * @mcs: MCS bitmap table for each NSS (array index 0 for 1 stream, etc.) + */ +struct nl80211_txrate_eht { + __u16 mcs[NL80211_EHT_NSS_MAX]; +}; + /** * enum nl80211_band - Frequency band * @NL80211_BAND_2GHZ: 2.4 GHz ISM band @@ -6650,6 +6683,9 @@ enum nl80211_feature_flags { * (signaling and payload protected) A-MSDUs and this shall be advertised * in the RSNXE. * + * @NL80211_EXT_FEATURE_BEACON_RATE_EHT: Driver supports beacon rate + * configuration (AP/mesh) with EHT rates. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6725,6 +6761,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_OWE_OFFLOAD_AP, NL80211_EXT_FEATURE_DFS_CONCURRENT, NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT, + NL80211_EXT_FEATURE_BEACON_RATE_EHT, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 89519aa52893..364d83fb9992 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -411,6 +411,14 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { [NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8, NL80211_RATE_INFO_HE_1XLTF, NL80211_RATE_INFO_HE_4XLTF), + [NL80211_TXRATE_EHT] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_eht)), + [NL80211_TXRATE_EHT_GI] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_GI_0_8, + NL80211_RATE_INFO_EHT_GI_3_2), + [NL80211_TXRATE_EHT_LTF] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_1XLTF, + NL80211_RATE_INFO_EHT_8XLTF), + }; static const struct nla_policy @@ -5393,6 +5401,164 @@ static bool he_set_mcs_mask(struct genl_info *info, return true; } +static void eht_build_mcs_mask(struct genl_info *info, + const struct ieee80211_sta_eht_cap *eht_cap, + u8 mcs_nss_len, u16 *mcs_mask) +{ + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + u8 nss, mcs_7 = 0, mcs_9 = 0, mcs_11 = 0, mcs_13 = 0; + unsigned int link_id = nl80211_link_id(info->attrs); + + if (mcs_nss_len == 4) { + const struct ieee80211_eht_mcs_nss_supp_20mhz_only *mcs = + &eht_cap->eht_mcs_nss_supp.only_20mhz; + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs7_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + + } else { + const struct ieee80211_eht_mcs_nss_supp_bw *mcs; + enum nl80211_chan_width width; + + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + width = wdev->u.ibss.chandef.width; + break; + case NL80211_IFTYPE_MESH_POINT: + width = wdev->u.mesh.chandef.width; + break; + case NL80211_IFTYPE_OCB: + width = wdev->u.ocb.chandef.width; + break; + default: + if (wdev->valid_links) + width = wdev->links[link_id].ap.chandef.width; + else + width = wdev->u.ap.preset_chandef.width; + break; + } + + switch (width) { + case NL80211_CHAN_WIDTH_320: + mcs = &eht_cap->eht_mcs_nss_supp.bw._320; + break; + case NL80211_CHAN_WIDTH_160: + mcs = &eht_cap->eht_mcs_nss_supp.bw._160; + break; + default: + mcs = &eht_cap->eht_mcs_nss_supp.bw._80; + break; + } + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + } + + /* Enable MCS 14 for NSS 0 */ + if (eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP) + mcs_mask[0] |= 0x4000; + + /* Enable MCS 15 for NSS 0 */ + mcs_mask[0] |= 0x8000; + + for (nss = 0; nss < NL80211_EHT_NSS_MAX; nss++) { + if (!mcs_7) + continue; + mcs_mask[nss] |= 0x00FF; + mcs_7--; + + if (!mcs_9) + continue; + mcs_mask[nss] |= 0x0300; + mcs_9--; + + if (!mcs_11) + continue; + mcs_mask[nss] |= 0x0C00; + mcs_11--; + + if (!mcs_13) + continue; + mcs_mask[nss] |= 0x3000; + mcs_13--; + } +} + +static bool eht_set_mcs_mask(struct genl_info *info, struct wireless_dev *wdev, + struct ieee80211_supported_band *sband, + struct nl80211_txrate_eht *txrate, + u16 mcs[NL80211_EHT_NSS_MAX]) +{ + const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u16 tx_mcs_mask[NL80211_EHT_NSS_MAX] = { 0 }; + u8 i, mcs_nss_len; + + he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); + if (!he_cap) + return false; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + return false; + + /* Checks for MCS 14 */ + if (txrate->mcs[0] & 0x4000) { + if (sband->band != NL80211_BAND_6GHZ) + return false; + + if (!(eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP)) + return false; + } + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + if (mcs_nss_len == 3) { + /* Supported iftypes for setting non-20 MHZ only EHT MCS */ + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_OCB: + break; + default: + return false; + } + } + + /* Build eht_mcs_mask from EHT and HE capabilities */ + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, tx_mcs_mask); + + memset(mcs, 0, sizeof(u16) * NL80211_EHT_NSS_MAX); + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) + mcs[i] = txrate->mcs[i]; + else + return false; + } + + return true; +} + static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, struct nlattr *attrs[], enum nl80211_attrs attr, @@ -5413,6 +5579,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, /* Default to all rates enabled */ for (i = 0; i < NUM_NL80211_BANDS; i++) { const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u8 mcs_nss_len; if (!default_all_enabled) break; @@ -5441,6 +5609,21 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[i].he_gi = 0xFF; mask->control[i].he_ltf = 0xFF; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + continue; + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, + mask->control[i].eht_mcs); + + mask->control[i].eht_gi = 0xFF; + mask->control[i].eht_ltf = 0xFF; } /* if no rates are given set it back to the defaults */ @@ -5512,13 +5695,27 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[band].he_ltf = nla_get_u8(tb[NL80211_TXRATE_HE_LTF]); + if (tb[NL80211_TXRATE_EHT] && + !eht_set_mcs_mask(info, wdev, sband, + nla_data(tb[NL80211_TXRATE_EHT]), + mask->control[band].eht_mcs)) + return -EINVAL; + + if (tb[NL80211_TXRATE_EHT_GI]) + mask->control[band].eht_gi = + nla_get_u8(tb[NL80211_TXRATE_EHT_GI]); + if (tb[NL80211_TXRATE_EHT_LTF]) + mask->control[band].eht_ltf = + nla_get_u8(tb[NL80211_TXRATE_EHT_LTF]); + if (mask->control[band].legacy == 0) { - /* don't allow empty legacy rates if HT, VHT or HE + /* don't allow empty legacy rates if HT, VHT, HE or EHT * are not even supported. */ if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || rdev->wiphy.bands[band]->vht_cap.vht_supported || - ieee80211_get_he_iftype_cap(sband, wdev->iftype))) + ieee80211_get_he_iftype_cap(sband, wdev->iftype) || + ieee80211_get_eht_iftype_cap(sband, wdev->iftype))) return -EINVAL; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) @@ -5533,6 +5730,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (mask->control[band].he_mcs[i]) goto out; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) + if (mask->control[band].eht_mcs[i]) + goto out; + /* legacy and mcs rates may not be both empty */ return -EINVAL; } @@ -5546,7 +5747,7 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, enum nl80211_band band, struct cfg80211_bitrate_mask *beacon_rate) { - u32 count_ht, count_vht, count_he, i; + u32 count_ht, count_vht, count_he, count_eht, i; u32 rate = beacon_rate->control[band].legacy; /* Allow only one rate */ @@ -5592,8 +5793,21 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, return -EINVAL; } - if ((count_ht && count_vht && count_he) || - (!rate && !count_ht && !count_vht && !count_he)) + count_eht = 0; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if (hweight16(beacon_rate->control[band].eht_mcs[i]) > 1) { + return -EINVAL; + } else if (beacon_rate->control[band].eht_mcs[i]) { + count_eht++; + if (count_eht > 1) + return -EINVAL; + } + if (count_eht && rate) + return -EINVAL; + } + + if ((count_ht && count_vht && count_he && count_eht) || + (!rate && !count_ht && !count_vht && !count_he && !count_eht)) return -EINVAL; if (rate && @@ -5613,6 +5827,11 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_BEACON_RATE_HE)) return -EINVAL; + if (count_eht && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_EHT)) + return -EINVAL; + return 0; } -- cgit v1.2.3 From 24185534915b5d926ded098336f47bdcca333aec Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Sun, 17 Aug 2025 21:04:32 +0200 Subject: wifi: nl80211: allow drivers to support subset of NL80211_CMD_SET_BSS The so-called fullmac devices rely on firmware functionality and/or API to change BSS parameters. Today there are limited drivers supporting the nl80211 primitive, but they only handle a subset of the bss parameters passed if any. The mac80211 driver does handle all parameters and stores their configured values. Some of the BSS parameters were already conditional by wiphy->features. For these the wiphy->bss_param_support and wiphy->features fields are silently aligned in wiphy_register(). Maybe better to issue a warning instead when they are misaligned. Signed-off-by: Arend van Spriel Link: https://patch.msgid.link/20250817190435.1495094-2-arend.vanspriel@broadcom.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 29 +++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 4 ++++ net/wireless/core.c | 9 +++++++++ net/wireless/nl80211.c | 39 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 79 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7d881aa7e48b..4072a67c9cc9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2459,6 +2459,29 @@ struct mpath_info { int generation; }; +/** + * enum wiphy_bss_param_flags - bit positions for supported bss parameters. + * + * @WIPHY_BSS_PARAM_CTS_PROT: support changing CTS protection. + * @WIPHY_BSS_PARAM_SHORT_PREAMBLE: support changing short preamble usage. + * @WIPHY_BSS_PARAM_SHORT_SLOT_TIME: support changing short slot time usage. + * @WIPHY_BSS_PARAM_BASIC_RATES: support reconfiguring basic rates. + * @WIPHY_BSS_PARAM_AP_ISOLATE: support changing AP isolation. + * @WIPHY_BSS_PARAM_HT_OPMODE: support changing HT operating mode. + * @WIPHY_BSS_PARAM_P2P_CTWINDOW: support reconfiguring ctwindow. + * @WIPHY_BSS_PARAM_P2P_OPPPS: support changing P2P opportunistic power-save. + */ +enum wiphy_bss_param_flags { + WIPHY_BSS_PARAM_CTS_PROT = BIT(0), + WIPHY_BSS_PARAM_SHORT_PREAMBLE = BIT(1), + WIPHY_BSS_PARAM_SHORT_SLOT_TIME = BIT(2), + WIPHY_BSS_PARAM_BASIC_RATES = BIT(3), + WIPHY_BSS_PARAM_AP_ISOLATE = BIT(4), + WIPHY_BSS_PARAM_HT_OPMODE = BIT(5), + WIPHY_BSS_PARAM_P2P_CTWINDOW = BIT(6), + WIPHY_BSS_PARAM_P2P_OPPPS = BIT(7), +}; + /** * struct bss_parameters - BSS parameters * @@ -5785,6 +5808,11 @@ struct wiphy_radio { * and probe responses. This value should be set if the driver * wishes to limit the number of csa counters. Default (0) means * infinite. + * @bss_param_support: bitmask indicating which bss_parameters as defined in + * &struct bss_parameters the driver can actually handle in the + * .change_bss() callback. The bit positions are defined in &enum + * wiphy_bss_param_flags. + * * @bss_select_support: bitmask indicating the BSS selection criteria supported * by the driver in the .connect() callback. The bit position maps to the * attribute indices defined in &enum nl80211_bss_select_attr. @@ -5970,6 +5998,7 @@ struct wiphy { u8 max_num_csa_counters; + u32 bss_param_support; u32 bss_select_support; u8 nan_supported_bands; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4f08264bbc8e..6c07100fc01f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2930,6 +2930,9 @@ enum nl80211_commands { * required alongside this attribute. Refer to * @enum nl80211_s1g_short_beacon_attrs for the attribute definitions. * + * @NL80211_ATTR_BSS_PARAM: nested attribute used with %NL80211_CMD_GET_WIPHY + * which indicates which BSS parameters can be modified. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3491,6 +3494,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_LONG_BEACON_PERIOD, NL80211_ATTR_S1G_SHORT_BEACON, + NL80211_ATTR_BSS_PARAM, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/core.c b/net/wireless/core.c index a7e2931ffb2e..797f9f2004a6 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1018,6 +1018,15 @@ int wiphy_register(struct wiphy *wiphy) rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_CTWIN; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_CTWINDOW; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_OPPPS; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_OPPPS; + rtnl_lock(); wiphy_lock(&rdev->wiphy); res = device_add(&rdev->wiphy.dev); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 364d83fb9992..153644a04072 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3027,6 +3027,40 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->wiphy.ext_features)) goto nla_put_failure; + if (rdev->wiphy.bss_param_support) { + struct nlattr *nested; + u32 parsup = rdev->wiphy.bss_param_support; + + nested = nla_nest_start(msg, NL80211_ATTR_BSS_PARAM); + if (!nested) + goto nla_put_failure; + + if ((parsup & WIPHY_BSS_PARAM_CTS_PROT) && + nla_put_flag(msg, NL80211_ATTR_BSS_CTS_PROT)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_PREAMBLE) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_PREAMBLE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_SLOT_TIME) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_SLOT_TIME)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_BASIC_RATES) && + nla_put_flag(msg, NL80211_ATTR_BSS_BASIC_RATES)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_AP_ISOLATE) && + nla_put_flag(msg, NL80211_ATTR_AP_ISOLATE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_HT_OPMODE) && + nla_put_flag(msg, NL80211_ATTR_BSS_HT_OPMODE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_CTWINDOW) && + nla_put_flag(msg, NL80211_ATTR_P2P_CTWINDOW)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_OPPPS) && + nla_put_flag(msg, NL80211_ATTR_P2P_OPPPS)) + goto nla_put_failure; + nla_nest_end(msg, nested); + } if (rdev->wiphy.bss_select_support) { struct nlattr *nested; u32 bss_select_support = rdev->wiphy.bss_select_support; @@ -9048,6 +9082,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct bss_parameters params; + u32 bss_param_support = rdev->wiphy.bss_param_support; memset(¶ms, 0, sizeof(params)); params.link_id = nl80211_link_id_or_invalid(info->attrs); @@ -9087,7 +9122,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) params.p2p_ctwindow = nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); if (params.p2p_ctwindow != 0 && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) + !(bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW)) return -EINVAL; } @@ -9099,7 +9134,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); params.p2p_opp_ps = tmp; if (params.p2p_opp_ps && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) + !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) return -EINVAL; } -- cgit v1.2.3 From 17d370a70bae277678b6ea82d71ef5892e7aaa97 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 26 Aug 2025 17:54:55 +0200 Subject: xdp, libeth: make the xdp_init_buff() micro-optimization generic Often times the compilers are not able to expand two consecutive 32-bit writes into one 64-bit on the corresponding architectures. This applies to xdp_init_buff() called for every received frame (or at least once per each 64 frames when the frag size is fixed). Move the not-so-pretty hack from libeth_xdp straight to xdp_init_buff(), but using a proper union around ::frame_sz and ::flags. The optimization is limited to LE architectures due to the structure layout. One simple example from idpf with the XDP series applied (Clang 22-git, CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE => -O2): add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-27 (-27) Function old new delta idpf_vport_splitq_napi_poll 5076 5049 -27 The perf difference with XDP_DROP is around +0.8-1% which I see as more than satisfying. Suggested-by: Simon Horman Signed-off-by: Alexander Lobakin Tested-by: Ramu R Signed-off-by: Tony Nguyen --- include/net/libeth/xdp.h | 11 +---------- include/net/xdp.h | 28 +++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index f4880b50e804..bc3507edd589 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -1274,7 +1274,6 @@ bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp, * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer * head with the Rx buffer data: data pointer, length, headroom, and * truesize/tailroom. Zeroes the flags. - * Uses faster single u64 write instead of per-field access. */ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, const struct libeth_fqe *fqe, @@ -1282,17 +1281,9 @@ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, { const struct page *page = __netmem_to_page(fqe->netmem); -#ifdef __LIBETH_WORD_ACCESS - static_assert(offsetofend(typeof(xdp->base), flags) - - offsetof(typeof(xdp->base), frame_sz) == - sizeof(u64)); - - *(u64 *)&xdp->base.frame_sz = fqe->truesize; -#else - xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); -#endif xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset, pp_page_to_nmdesc(page)->pp->p.offset, len, true); + xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); } /** diff --git a/include/net/xdp.h b/include/net/xdp.h index b40f1f96cb11..af60e11b336c 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -85,8 +85,20 @@ struct xdp_buff { void *data_hard_start; struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; - u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ - u32 flags; /* supported values defined in xdp_buff_flags */ + + union { + struct { + /* frame size to deduce data_hard_end/tailroom */ + u32 frame_sz; + /* supported values defined in xdp_buff_flags */ + u32 flags; + }; + +#ifdef __LITTLE_ENDIAN + /* Used to micro-optimize xdp_init_buff(), don't use directly */ + u64 frame_sz_flags_init; +#endif + }; }; static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) @@ -118,9 +130,19 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { - xdp->frame_sz = frame_sz; xdp->rxq = rxq; + +#ifdef __LITTLE_ENDIAN + /* + * Force the compilers to initialize ::flags and assign ::frame_sz with + * one write on 64-bit LE architectures as they're often unable to do + * it themselves. + */ + xdp->frame_sz_flags_init = frame_sz; +#else + xdp->frame_sz = frame_sz; xdp->flags = 0; +#endif } static __always_inline void -- cgit v1.2.3 From ceac1fb2290d230eb83aff3761058c559440de13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2025 16:58:06 +0000 Subject: ipv6: snmp: do not use SNMP_MIB_SENTINEL anymore Use ARRAY_SIZE(), so that we know the limit at compile time. Following patch needs this preliminary change. Signed-off-by: Eric Dumazet Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250905165813.1470708-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 24 ++++++++++++++++++++++++ net/ipv6/proc.c | 43 ++++++++++++++++++++++++------------------- 2 files changed, 48 insertions(+), 19 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 6dbd2bf8fa9c..a1624e8db1ab 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -338,6 +338,19 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } +#define snmp_get_cpu_field64_batch_cnt(buff64, stats_list, cnt, \ + mib_statistic, offset) \ +{ \ + int i, c; \ + for_each_possible_cpu(c) { \ + for (i = 0; i < cnt; i++) \ + buff64[i] += snmp_get_cpu_field64( \ + mib_statistic, \ + c, stats_list[i].entry, \ + offset); \ + } \ +} + #define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \ { \ int i, c; \ @@ -349,6 +362,17 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } +#define snmp_get_cpu_field_batch_cnt(buff, stats_list, cnt, mib_statistic) \ +{ \ + int i, c; \ + for_each_possible_cpu(c) { \ + for (i = 0; i < cnt; i++) \ + buff[i] += snmp_get_cpu_field( \ + mib_statistic, \ + c, stats_list[i].entry); \ + } \ +} + static inline void inet_get_local_port_range(const struct net *net, int *low, int *high) { u32 range = READ_ONCE(net->ipv4.ip_local_ports.range); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index e96f14a36834..92ed04729c2f 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -85,7 +85,6 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_icmp6_list[] = { @@ -96,7 +95,6 @@ static const struct snmp_mib snmp6_icmp6_list[] = { SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_udp6_list[] = { @@ -109,7 +107,6 @@ static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_udplite6_list[] = { @@ -121,7 +118,6 @@ static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) @@ -182,35 +178,37 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) */ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, atomic_long_t *smib, - const struct snmp_mib *itemlist) + const struct snmp_mib *itemlist, + int cnt) { unsigned long buff[SNMP_MIB_MAX]; int i; if (pcpumib) { - memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX); + memset(buff, 0, sizeof(unsigned long) * cnt); - snmp_get_cpu_field_batch(buff, itemlist, pcpumib); - for (i = 0; itemlist[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, itemlist, cnt, pcpumib); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, buff[i]); } else { - for (i = 0; itemlist[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, atomic_long_read(smib + itemlist[i].entry)); } } static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, - const struct snmp_mib *itemlist, size_t syncpoff) + const struct snmp_mib *itemlist, + int cnt, size_t syncpoff) { u64 buff64[SNMP_MIB_MAX]; int i; - memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX); + memset(buff64, 0, sizeof(u64) * cnt); - snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff); - for (i = 0; itemlist[i].name; i++) + snmp_get_cpu_field64_batch_cnt(buff64, itemlist, cnt, mib, syncpoff); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]); } @@ -219,14 +217,19 @@ static int snmp6_seq_show(struct seq_file *seq, void *v) struct net *net = (struct net *)seq->private; snmp6_seq_show_item64(seq, net->mib.ipv6_statistics, - snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_ipstats_list, + ARRAY_SIZE(snmp6_ipstats_list), + offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, net->mib.icmpv6_statistics, - NULL, snmp6_icmp6_list); + NULL, snmp6_icmp6_list, + ARRAY_SIZE(snmp6_icmp6_list)); snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs); snmp6_seq_show_item(seq, net->mib.udp_stats_in6, - NULL, snmp6_udp6_list); + NULL, snmp6_udp6_list, + ARRAY_SIZE(snmp6_udp6_list)); snmp6_seq_show_item(seq, net->mib.udplite_stats_in6, - NULL, snmp6_udplite6_list); + NULL, snmp6_udplite6_list, + ARRAY_SIZE(snmp6_udplite6_list)); return 0; } @@ -236,9 +239,11 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); snmp6_seq_show_item64(seq, idev->stats.ipv6, - snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_ipstats_list, + ARRAY_SIZE(snmp6_ipstats_list), + offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, - snmp6_icmp6_list); + snmp6_icmp6_list, ARRAY_SIZE(snmp6_icmp6_list)); snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs); return 0; } -- cgit v1.2.3 From 20d3d26815441d03a9a15114729faaa54957baba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2025 16:58:13 +0000 Subject: net: snmp: remove SNMP_MIB_SENTINEL No more user of SNMP_MIB_SENTINEL, we can remove it. Also remove snmp_get_cpu_field[64]_batch() helpers. Signed-off-by: Eric Dumazet Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250905165813.1470708-10-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 23 ----------------------- include/net/snmp.h | 5 ----- 2 files changed, 28 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index a1624e8db1ab..380afb691c41 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -326,18 +326,6 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } #endif -#define snmp_get_cpu_field64_batch(buff64, stats_list, mib_statistic, offset) \ -{ \ - int i, c; \ - for_each_possible_cpu(c) { \ - for (i = 0; stats_list[i].name; i++) \ - buff64[i] += snmp_get_cpu_field64( \ - mib_statistic, \ - c, stats_list[i].entry, \ - offset); \ - } \ -} - #define snmp_get_cpu_field64_batch_cnt(buff64, stats_list, cnt, \ mib_statistic, offset) \ { \ @@ -351,17 +339,6 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } -#define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \ -{ \ - int i, c; \ - for_each_possible_cpu(c) { \ - for (i = 0; stats_list[i].name; i++) \ - buff[i] += snmp_get_cpu_field( \ - mib_statistic, \ - c, stats_list[i].entry); \ - } \ -} - #define snmp_get_cpu_field_batch_cnt(buff, stats_list, cnt, mib_statistic) \ { \ int i, c; \ diff --git a/include/net/snmp.h b/include/net/snmp.h index 4cb4326dfebe..584e70742e9b 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -36,11 +36,6 @@ struct snmp_mib { .entry = _entry, \ } -#define SNMP_MIB_SENTINEL { \ - .name = NULL, \ - .entry = 0, \ -} - /* * We use unsigned longs for most mibs but u64 for ipstats. */ -- cgit v1.2.3 From 6b6dc81ee7e8ca87c71a533e1d69cf96a4f1e986 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 2 Sep 2025 06:44:59 +0000 Subject: bonding: add support for per-port LACP actor priority Introduce a new netlink attribute 'actor_port_prio' to allow setting the LACP actor port priority on a per-slave basis. This extends the existing bonding infrastructure to support more granular control over LACP negotiations. The priority value is embedded in LACPDU packets and will be used by subsequent patches to influence aggregator selection policies. Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250902064501.360822-2-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- Documentation/networking/bonding.rst | 9 +++++++++ drivers/net/bonding/bond_3ad.c | 4 ++++ drivers/net/bonding/bond_netlink.c | 16 ++++++++++++++++ drivers/net/bonding/bond_options.c | 36 ++++++++++++++++++++++++++++++++++++ include/net/bond_3ad.h | 1 + include/net/bond_options.h | 1 + include/uapi/linux/if_link.h | 1 + 7 files changed, 68 insertions(+) (limited to 'include/net') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index a2b42ae719d2..345b60992446 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -193,6 +193,15 @@ ad_actor_sys_prio This parameter has effect only in 802.3ad mode and is available through SysFs interface. +actor_port_prio + + In an AD system, this specifies the port priority. The allowed range + is 1 - 65535. If the value is not specified, it takes 255 as the + default value. + + This parameter has effect only in 802.3ad mode and is available through + netlink interface. + ad_actor_system In an AD system, this specifies the mac-address for the actor in diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 4edc8e6b6b64..67ca78923b04 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -436,6 +436,7 @@ static void __ad_actor_update_port(struct port *port) port->actor_system = BOND_AD_INFO(bond).system.sys_mac_addr; port->actor_system_priority = BOND_AD_INFO(bond).system.sys_priority; + port->actor_port_priority = SLAVE_AD_INFO(port->slave)->port_priority; } /* Conversions */ @@ -2209,6 +2210,9 @@ void bond_3ad_bind_slave(struct slave *slave) ad_initialize_port(port, &bond->params); + /* Port priority is initialized. Update it to slave's ad info */ + SLAVE_AD_INFO(slave)->port_priority = port->actor_port_priority; + port->slave = slave; port->actor_port_number = SLAVE_AD_INFO(slave)->id; /* key is determined according to the link speed, duplex and diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index e573b34a1bbc..ba71d95a82d2 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -28,6 +28,7 @@ static size_t bond_get_slave_size(const struct net_device *bond_dev, nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE */ nla_total_size(sizeof(s32)) + /* IFLA_BOND_SLAVE_PRIO */ + nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_ACTOR_PORT_PRIO */ 0; } @@ -77,6 +78,10 @@ static int bond_fill_slave_info(struct sk_buff *skb, ad_port->partner_oper.port_state)) goto nla_put_failure; } + + if (nla_put_u16(skb, IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, + SLAVE_AD_INFO(slave)->port_priority)) + goto nla_put_failure; } return 0; @@ -130,6 +135,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = { [IFLA_BOND_SLAVE_QUEUE_ID] = { .type = NLA_U16 }, [IFLA_BOND_SLAVE_PRIO] = { .type = NLA_S32 }, + [IFLA_BOND_SLAVE_ACTOR_PORT_PRIO] = { .type = NLA_U16 }, }; static int bond_validate(struct nlattr *tb[], struct nlattr *data[], @@ -180,6 +186,16 @@ static int bond_slave_changelink(struct net_device *bond_dev, return err; } + if (data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO]) { + u16 ad_prio = nla_get_u16(data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO]); + + bond_opt_slave_initval(&newval, &slave_dev, ad_prio); + err = __bond_opt_set(bond, BOND_OPT_ACTOR_PORT_PRIO, &newval, + data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO], extack); + if (err) + return err; + } + return 0; } diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index c0a5eb8766b5..897022272334 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -79,6 +79,8 @@ static int bond_option_tlb_dynamic_lb_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_actor_sys_prio_set(struct bonding *bond, const struct bond_opt_value *newval); +static int bond_option_actor_port_prio_set(struct bonding *bond, + const struct bond_opt_value *newval); static int bond_option_ad_actor_system_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_user_port_key_set(struct bonding *bond, @@ -222,6 +224,13 @@ static const struct bond_opt_value bond_ad_actor_sys_prio_tbl[] = { { NULL, -1, 0}, }; +static const struct bond_opt_value bond_actor_port_prio_tbl[] = { + { "minval", 0, BOND_VALFLAG_MIN}, + { "maxval", 65535, BOND_VALFLAG_MAX}, + { "default", 255, BOND_VALFLAG_DEFAULT}, + { NULL, -1, 0}, +}; + static const struct bond_opt_value bond_ad_user_port_key_tbl[] = { { "minval", 0, BOND_VALFLAG_MIN | BOND_VALFLAG_DEFAULT}, { "maxval", 1023, BOND_VALFLAG_MAX}, @@ -483,6 +492,13 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { .values = bond_ad_actor_sys_prio_tbl, .set = bond_option_ad_actor_sys_prio_set, }, + [BOND_OPT_ACTOR_PORT_PRIO] = { + .id = BOND_OPT_ACTOR_PORT_PRIO, + .name = "actor_port_prio", + .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), + .values = bond_actor_port_prio_tbl, + .set = bond_option_actor_port_prio_set, + }, [BOND_OPT_AD_ACTOR_SYSTEM] = { .id = BOND_OPT_AD_ACTOR_SYSTEM, .name = "ad_actor_system", @@ -1812,6 +1828,26 @@ static int bond_option_ad_actor_sys_prio_set(struct bonding *bond, return 0; } +static int bond_option_actor_port_prio_set(struct bonding *bond, + const struct bond_opt_value *newval) +{ + struct slave *slave; + + slave = bond_slave_get_rtnl(newval->slave_dev); + if (!slave) { + netdev_dbg(bond->dev, "%s called on NULL slave\n", __func__); + return -ENODEV; + } + + netdev_dbg(newval->slave_dev, "Setting actor_port_prio to %llu\n", + newval->value); + + SLAVE_AD_INFO(slave)->port_priority = newval->value; + bond_3ad_update_ad_actor_settings(bond); + + return 0; +} + static int bond_option_ad_actor_system_set(struct bonding *bond, const struct bond_opt_value *newval) { diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index dba369a2cf27..e9188646e22e 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -274,6 +274,7 @@ struct ad_slave_info { struct port port; /* 802.3ad port structure */ struct bond_3ad_stats stats; u16 id; + u16 port_priority; }; static inline const char *bond_3ad_churn_desc(churn_state_t state) diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 022b122a9fb6..e6eedf23aea1 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -78,6 +78,7 @@ enum { BOND_OPT_PRIO, BOND_OPT_COUPLED_CONTROL, BOND_OPT_BROADCAST_NEIGH, + BOND_OPT_ACTOR_PORT_PRIO, BOND_OPT_LAST }; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 784ace3a519c..45f56c9f95d9 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1564,6 +1564,7 @@ enum { IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, IFLA_BOND_SLAVE_PRIO, + IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, __IFLA_BOND_SLAVE_MAX, }; -- cgit v1.2.3 From e5a6643435fa4ad1e104323ec7d3e6215e2d832c Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 2 Sep 2025 06:45:00 +0000 Subject: bonding: support aggregator selection based on port priority Add a new ad_select policy 'port_priority' that uses the per-port actor priority values (set via ad_actor_port_prio) to determine aggregator selection. This allows administrators to influence which ports are preferred for aggregation by assigning different priority values, providing more flexible load balancing control in LACP configurations. Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250902064501.360822-3-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- Documentation/networking/bonding.rst | 16 ++++++++++++---- drivers/net/bonding/bond_3ad.c | 27 +++++++++++++++++++++++++++ drivers/net/bonding/bond_options.c | 9 +++++---- include/net/bond_3ad.h | 1 + 4 files changed, 45 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index 345b60992446..e700bf1d095c 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -250,10 +250,18 @@ ad_select ports (slaves). Reselection occurs as described under the "bandwidth" setting, above. - The bandwidth and count selection policies permit failover of - 802.3ad aggregations when partial failure of the active aggregator - occurs. This keeps the aggregator with the highest availability - (either in bandwidth or in number of ports) active at all times. + actor_port_prio or 3 + + The active aggregator is chosen by the highest total sum of + actor port priorities across its active ports. Note this + priority is actor_port_prio, not per port prio, which is + used for primary reselect. + + The bandwidth, count and actor_port_prio selection policies permit + failover of 802.3ad aggregations when partial failure of the active + aggregator occurs. This keeps the aggregator with the highest + availability (either in bandwidth, number of ports, or total value + of port priorities) active at all times. This option was added in bonding version 3.4.0. diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 67ca78923b04..49717b7b82a2 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -747,6 +747,18 @@ static int __agg_active_ports(struct aggregator *agg) return active; } +static unsigned int __agg_ports_priority(const struct aggregator *agg) +{ + struct port *port = agg->lag_ports; + unsigned int prio = 0; + + for (; port; port = port->next_port_in_aggregator) + if (port->is_enabled) + prio += port->actor_port_priority; + + return prio; +} + /** * __get_agg_bandwidth - get the total bandwidth of an aggregator * @aggregator: the aggregator we're looking at @@ -1708,6 +1720,9 @@ static struct aggregator *ad_agg_selection_test(struct aggregator *best, * 4. Therefore, current and best both have partner replies or * both do not, so perform selection policy: * + * BOND_AD_PRIO: Select by total priority of ports. If priority + * is equal, select by count. + * * BOND_AD_COUNT: Select by count of ports. If count is equal, * select by bandwidth. * @@ -1729,6 +1744,14 @@ static struct aggregator *ad_agg_selection_test(struct aggregator *best, return best; switch (__get_agg_selection_mode(curr->lag_ports)) { + case BOND_AD_PRIO: + if (__agg_ports_priority(curr) > __agg_ports_priority(best)) + return curr; + + if (__agg_ports_priority(curr) < __agg_ports_priority(best)) + return best; + + fallthrough; case BOND_AD_COUNT: if (__agg_active_ports(curr) > __agg_active_ports(best)) return curr; @@ -1794,6 +1817,10 @@ static int agg_device_up(const struct aggregator *agg) * (slaves), and reselect whenever a link state change takes place or the * set of slaves in the bond changes. * + * BOND_AD_PRIO: select the aggregator with highest total priority of ports + * (slaves), and reselect whenever a link state change takes place or the + * set of slaves in the bond changes. + * * FIXME: this function MUST be called with the first agg in the bond, or * __get_active_agg() won't work correctly. This function should be better * called with the bond itself, and retrieve the first agg from it. diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 897022272334..5b275cb266bc 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -162,10 +162,11 @@ static const struct bond_opt_value bond_lacp_rate_tbl[] = { }; static const struct bond_opt_value bond_ad_select_tbl[] = { - { "stable", BOND_AD_STABLE, BOND_VALFLAG_DEFAULT}, - { "bandwidth", BOND_AD_BANDWIDTH, 0}, - { "count", BOND_AD_COUNT, 0}, - { NULL, -1, 0}, + { "stable", BOND_AD_STABLE, BOND_VALFLAG_DEFAULT}, + { "bandwidth", BOND_AD_BANDWIDTH, 0}, + { "count", BOND_AD_COUNT, 0}, + { "actor_port_prio", BOND_AD_PRIO, 0}, + { NULL, -1, 0}, }; static const struct bond_opt_value bond_num_peer_notif_tbl[] = { diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index e9188646e22e..c92d4a976246 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -26,6 +26,7 @@ enum { BOND_AD_STABLE = 0, BOND_AD_BANDWIDTH = 1, BOND_AD_COUNT = 2, + BOND_AD_PRIO = 3, }; /* rx machine states(43.4.11 in the 802.3ad standard) */ -- cgit v1.2.3 From ce0b015e2619ae64b7d33fb24a6b6cadcd70c317 Mon Sep 17 00:00:00 2001 From: Vlad Dumitrescu Date: Sat, 6 Sep 2025 18:29:43 -0700 Subject: devlink: Add 'total_vfs' generic device param NICs are typically configured with total_vfs=0, forcing users to rely on external tools to enable SR-IOV (a widely used and essential feature). Add total_vfs parameter to devlink for SR-IOV max VF configurability. Enables standard kernel tools to manage SR-IOV, addressing the need for flexible VF configuration. Signed-off-by: Vlad Dumitrescu Tested-by: Kamal Heib Reviewed-by: Jiri Pirko Signed-off-by: Saeed Mahameed Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250907012953.301746-2-saeed@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-params.rst | 5 +++++ include/net/devlink.h | 4 ++++ net/devlink/param.c | 5 +++++ 3 files changed, 14 insertions(+) (limited to 'include/net') diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst index 211b58177e12..c51da4fba7e7 100644 --- a/Documentation/networking/devlink/devlink-params.rst +++ b/Documentation/networking/devlink/devlink-params.rst @@ -143,3 +143,8 @@ own name. * - ``clock_id`` - u64 - Clock ID used by the device for registering DPLL devices and pins. + * - ``total_vfs`` + - u32 + - The max number of Virtual Functions (VFs) exposed by the PF. + after reboot/pci reset, 'sriov_totalvfs' entry under the device's sysfs + directory will report this value. diff --git a/include/net/devlink.h b/include/net/devlink.h index 5f44e702c25c..8d4362f010e4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -530,6 +530,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, + DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -594,6 +595,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME "clock_id" #define DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE DEVLINK_PARAM_TYPE_U64 +#define DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME "total_vfs" +#define DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/devlink/param.c b/net/devlink/param.c index 41dcc86cfd94..33134940c266 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -102,6 +102,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME, .type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, + .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME, + .type = DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From 64102d9bbc3d41dac5188b8fba75b1344c438970 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:20 +0200 Subject: netfilter: nf_tables: place base_seq in struct net This will soon be read from packet path around same time as the gencursor. Both gencursor and base_seq get incremented almost at the same time, so it makes sense to place them in the same structure. This doesn't increase struct net size on 64bit due to padding. Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 1 - include/net/netns/nftables.h | 1 + net/netfilter/nf_tables_api.c | 65 ++++++++++++++++++++------------------- 3 files changed, 34 insertions(+), 33 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 891e43a01bdc..3faa80f5d801 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1912,7 +1912,6 @@ struct nftables_pernet { struct mutex commit_mutex; u64 table_handle; u64 tstamp; - unsigned int base_seq; unsigned int gc_seq; u8 validate_state; struct work_struct destroy_work; diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index cc8060c017d5..99dd166c5d07 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -3,6 +3,7 @@ #define _NETNS_NFTABLES_H_ struct netns_nftables { + unsigned int base_seq; u8 gencursor; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c1082de09656..9518b50695ba 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1131,11 +1131,14 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, return ERR_PTR(-ENOENT); } -static __be16 nft_base_seq(const struct net *net) +static unsigned int nft_base_seq(const struct net *net) { - struct nftables_pernet *nft_net = nft_pernet(net); + return READ_ONCE(net->nft.base_seq); +} - return htons(nft_net->base_seq & 0xffff); +static __be16 nft_base_seq_be16(const struct net *net) +{ + return htons(nft_base_seq(net) & 0xffff); } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { @@ -1155,7 +1158,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -1248,7 +1251,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -2030,7 +2033,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -2133,7 +2136,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -3671,7 +3674,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, - nft_base_seq(net)); + nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -3839,7 +3842,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -4050,7 +4053,7 @@ static int nf_tables_getrule_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_RULE_TABLE]), (char *)nla_data(nla[NFTA_RULE_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); kfree(buf); @@ -4887,7 +4890,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), flags, ctx->family, NFNETLINK_V0, - nft_base_seq(ctx->net)); + nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -5032,7 +5035,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && @@ -6209,7 +6212,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && @@ -6238,7 +6241,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) seq = cb->nlh->nlmsg_seq; nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, - table->family, NFNETLINK_V0, nft_base_seq(net)); + table->family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -6331,7 +6334,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, - NFNETLINK_V0, nft_base_seq(ctx->net)); + NFNETLINK_V0, nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -6630,7 +6633,7 @@ static int nf_tables_getsetelem_reset(struct sk_buff *skb, } nelems++; } - audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems); + audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems); out_unlock: rcu_read_unlock(); @@ -8381,7 +8384,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -8446,7 +8449,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -8480,7 +8483,7 @@ cont: idx++; } if (ctx->reset && entries) - audit_log_obj_reset(table, nft_net->base_seq, entries); + audit_log_obj_reset(table, nft_base_seq(net), entries); if (rc < 0) break; } @@ -8649,7 +8652,7 @@ static int nf_tables_getobj_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_OBJ_TABLE]), (char *)nla_data(nla[NFTA_OBJ_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); kfree(buf); @@ -8754,9 +8757,8 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp) { - struct nftables_pernet *nft_net = nft_pernet(net); char *buf = kasprintf(gfp, "%s:%u", - table->name, nft_net->base_seq); + table->name, nft_base_seq(net)); audit_log_nfcfg(buf, family, @@ -9442,7 +9444,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -9511,7 +9513,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -9696,17 +9698,16 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { - struct nftables_pernet *nft_net = nft_pernet(net); struct nlmsghdr *nlh; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC, - NFNETLINK_V0, nft_base_seq(net)); + NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) || + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; @@ -10968,11 +10969,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) * Bump generation counter, invalidate any dump in progress. * Cannot fail after this point. */ - base_seq = READ_ONCE(nft_net->base_seq); + base_seq = nft_base_seq(net); while (++base_seq == 0) ; - WRITE_ONCE(nft_net->base_seq, base_seq); + WRITE_ONCE(net->nft.base_seq, base_seq); gc_seq = nft_gc_seq_begin(nft_net); @@ -11181,7 +11182,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); - nf_tables_commit_audit_log(&adl, nft_net->base_seq); + nf_tables_commit_audit_log(&adl, nft_base_seq(net)); nft_gc_seq_end(nft_net, gc_seq); nft_net->validate_state = NFT_VALIDATE_SKIP; @@ -11506,7 +11507,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid) mutex_lock(&nft_net->commit_mutex); nft_net->tstamp = get_jiffies_64(); - genid_ok = genid == 0 || nft_net->base_seq == genid; + genid_ok = genid == 0 || nft_base_seq(net) == genid; if (!genid_ok) mutex_unlock(&nft_net->commit_mutex); @@ -12143,7 +12144,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); - nft_net->base_seq = 1; + net->nft.base_seq = 1; nft_net->gc_seq = 0; nft_net->validate_state = NFT_VALIDATE_SKIP; INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work); -- cgit v1.2.3 From 11fe5a82e53ac3581a80c88e0e35fb8a80e15f48 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:21 +0200 Subject: netfilter: nf_tables: make nft_set_do_lookup available unconditionally This function was added for retpoline mitigation and is replaced by a static inline helper if mitigations are not enabled. Enable this helper function unconditionally so next patch can add a lookup restart mechanism to fix possible false negatives while transactions are in progress. Adding lookup restarts in nft_lookup_eval doesn't work as nft_objref would then need the same copypaste loop. This patch is separate to ease review of the actual bug fix. Suggested-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables_core.h | 10 ++-------- net/netfilter/nft_lookup.c | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 6c2f483d9828..656e784714f3 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -109,17 +109,11 @@ nft_hash_lookup_fast(const struct net *net, const struct nft_set *set, const struct nft_set_ext * nft_hash_lookup(const struct net *net, const struct nft_set *set, const u32 *key); +#endif + const struct nft_set_ext * nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key); -#else -static inline const struct nft_set_ext * -nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key) -{ - return set->ops->lookup(net, set, key); -} -#endif /* called from nft_pipapo_avx2.c */ const struct nft_set_ext * diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 40c602ffbcba..2c6909bf1b40 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -24,11 +24,11 @@ struct nft_lookup { struct nft_set_binding binding; }; -#ifdef CONFIG_MITIGATION_RETPOLINE -const struct nft_set_ext * -nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key) +static const struct nft_set_ext * +__nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { +#ifdef CONFIG_MITIGATION_RETPOLINE if (set->ops == &nft_set_hash_fast_type.ops) return nft_hash_lookup_fast(net, set, key); if (set->ops == &nft_set_hash_type.ops) @@ -51,10 +51,17 @@ nft_set_do_lookup(const struct net *net, const struct nft_set *set, return nft_rbtree_lookup(net, set, key); WARN_ON_ONCE(1); +#endif return set->ops->lookup(net, set, key); } + +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + return __nft_set_do_lookup(net, set, key); +} EXPORT_SYMBOL_GPL(nft_set_do_lookup); -#endif void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, -- cgit v1.2.3 From 1827f773e416842bb0a1be93f313e02591e0b0c2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 5 Sep 2025 15:15:38 -0700 Subject: net: xdp: pass full flags to xdp_update_skb_shared_info() xdp_update_skb_shared_info() needs to update skb state which was maintained in xdp_buff / frame. Pass full flags into it, instead of breaking it out bit by bit. We will need to add a bit for unreadable frags (even tho XDP doesn't support those the driver paths may be common), at which point almost all call sites would become: xdp_update_skb_shared_info(skb, num_frags, sinfo->xdp_frags_size, MY_PAGE_SIZE * num_frags, xdp_buff_is_frag_pfmemalloc(xdp), xdp_buff_is_frag_unreadable(xdp)); Keep a helper for accessing the flags, in case we need to transform them somehow in the future (e.g. to cover up xdp_buff vs xdp_frame differences). While we are touching call callers - rename the helper to xdp_update_skb_frags_info(), previous name may have implied that it's shinfo that's updated. We are updating flags in struct sk_buff based on frags that got attched. Signed-off-by: Jakub Kicinski Acked-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/20250905221539.2930285-2-kuba@kernel.org Acked-by: Stanislav Fomichev Reviewed-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 7 +++---- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 15 +++++++-------- drivers/net/ethernet/intel/ice/ice_txrx.c | 15 +++++++-------- drivers/net/ethernet/marvell/mvneta.c | 7 +++---- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 23 +++++++++++------------ drivers/net/virtio_net.c | 7 +++---- include/net/xdp.h | 23 +++++++++++------------ net/core/xdp.c | 21 ++++++++++----------- 8 files changed, 55 insertions(+), 63 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 58d579dca3f1..3e77a96e5a3e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -468,9 +468,8 @@ bnxt_xdp_build_skb(struct bnxt *bp, struct sk_buff *skb, u8 num_frags, if (!skb) return NULL; - xdp_update_skb_shared_info(skb, num_frags, - sinfo->xdp_frags_size, - BNXT_RX_PAGE_SIZE * num_frags, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, num_frags, sinfo->xdp_frags_size, + BNXT_RX_PAGE_SIZE * num_frags, + xdp_buff_get_skb_flags(xdp)); return skb; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 048c33039130..98601c62c592 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2151,10 +2151,10 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0], sizeof(skb_frag_t) * nr_frags); - xdp_update_skb_shared_info(skb, skinfo->nr_frags + nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, skinfo->nr_frags + nr_frags, + sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); /* First buffer has already been processed, so bump ntc */ if (++rx_ring->next_to_clean == rx_ring->count) @@ -2206,10 +2206,9 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, skb_metadata_set(skb, metasize); if (unlikely(xdp_buff_has_frags(xdp))) { - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); i40e_process_rx_buffs(rx_ring, I40E_XDP_PASS, xdp); } else { diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index d2871757ec94..107632a71f3c 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1035,10 +1035,9 @@ ice_build_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) skb_metadata_set(skb, metasize); if (unlikely(xdp_buff_has_frags(xdp))) - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); return skb; } @@ -1115,10 +1114,10 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0], sizeof(skb_frag_t) * nr_frags); - xdp_update_skb_shared_info(skb, skinfo->nr_frags + nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, skinfo->nr_frags + nr_frags, + sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); } return skb; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 476e73e502fe..7351e98d73f4 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2416,10 +2416,9 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool, skb->ip_summed = mvneta_rx_csum(pp, desc_status); if (unlikely(xdp_buff_has_frags(xdp))) - xdp_update_skb_shared_info(skb, num_frags, - sinfo->xdp_frags_size, - num_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, num_frags, sinfo->xdp_frags_size, + num_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); return skb; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index b8c609d91d11..2925ece136c4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1796,10 +1796,9 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi if (xdp_buff_has_frags(&mxbuf->xdp)) { /* sinfo->nr_frags is reset by build_skb, calculate again. */ - xdp_update_skb_shared_info(skb, wi - head_wi - 1, - sinfo->xdp_frags_size, truesize, - xdp_buff_is_frag_pfmemalloc( - &mxbuf->xdp)); + xdp_update_skb_frags_info(skb, wi - head_wi - 1, + sinfo->xdp_frags_size, truesize, + xdp_buff_get_skb_flags(&mxbuf->xdp)); for (struct mlx5e_wqe_frag_info *pwi = head_wi + 1; pwi < wi; pwi++) pwi->frag_page->frags++; @@ -2105,10 +2104,10 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w struct mlx5e_frag_page *pagep; /* sinfo->nr_frags is reset by build_skb, calculate again. */ - xdp_update_skb_shared_info(skb, frag_page - head_page, - sinfo->xdp_frags_size, truesize, - xdp_buff_is_frag_pfmemalloc( - &mxbuf->xdp)); + xdp_update_skb_frags_info(skb, frag_page - head_page, + sinfo->xdp_frags_size, + truesize, + xdp_buff_get_skb_flags(&mxbuf->xdp)); pagep = head_page; do @@ -2122,10 +2121,10 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w if (xdp_buff_has_frags(&mxbuf->xdp)) { struct mlx5e_frag_page *pagep; - xdp_update_skb_shared_info(skb, sinfo->nr_frags, - sinfo->xdp_frags_size, truesize, - xdp_buff_is_frag_pfmemalloc( - &mxbuf->xdp)); + xdp_update_skb_frags_info(skb, sinfo->nr_frags, + sinfo->xdp_frags_size, + truesize, + xdp_buff_get_skb_flags(&mxbuf->xdp)); pagep = frag_page - sinfo->nr_frags; do diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 975bdc5dab84..06708c9a979e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2185,10 +2185,9 @@ static struct sk_buff *build_skb_from_xdp_buff(struct net_device *dev, skb_metadata_set(skb, metasize); if (unlikely(xdp_buff_has_frags(xdp))) - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - xdp_frags_truesz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + xdp_frags_truesz, + xdp_buff_get_skb_flags(xdp)); return skb; } diff --git a/include/net/xdp.h b/include/net/xdp.h index af60e11b336c..976cfd2f113c 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -116,15 +116,14 @@ static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } -static __always_inline bool -xdp_buff_is_frag_pfmemalloc(const struct xdp_buff *xdp) +static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) { - return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } -static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) +static __always_inline u32 xdp_buff_get_skb_flags(const struct xdp_buff *xdp) { - xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; + return xdp->flags; } static __always_inline void @@ -294,10 +293,10 @@ static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } -static __always_inline bool -xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) +static __always_inline u32 +xdp_frame_get_skb_flags(const struct xdp_frame *frame) { - return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + return frame->flags; } #define XDP_BULK_QUEUE_SIZE 16 @@ -334,9 +333,9 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) } static inline void -xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, - unsigned int size, unsigned int truesize, - bool pfmemalloc) +xdp_update_skb_frags_info(struct sk_buff *skb, u8 nr_frags, + unsigned int size, unsigned int truesize, + u32 xdp_flags) { struct skb_shared_info *sinfo = skb_shinfo(skb); @@ -350,7 +349,7 @@ xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, skb->len += size; skb->data_len += size; skb->truesize += truesize; - skb->pfmemalloc |= pfmemalloc; + skb->pfmemalloc |= !!(xdp_flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } /* Avoids inlining WARN macro in fast-path */ diff --git a/net/core/xdp.c b/net/core/xdp.c index 491334b9b8be..9100e160113a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -663,9 +663,8 @@ struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) u32 tsize; tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz; - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, tsize, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + tsize, xdp_buff_get_skb_flags(xdp)); } skb->protocol = eth_type_trans(skb, rxq->dev); @@ -692,7 +691,7 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, struct skb_shared_info *sinfo = skb_shinfo(skb); const struct skb_shared_info *xinfo; u32 nr_frags, tsize = 0; - bool pfmemalloc = false; + u32 flags = 0; xinfo = xdp_get_shared_info_from_buff(xdp); nr_frags = xinfo->nr_frags; @@ -714,11 +713,12 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, __skb_fill_page_desc_noacc(sinfo, i, page, offset, len); tsize += truesize; - pfmemalloc |= page_is_pfmemalloc(page); + if (page_is_pfmemalloc(page)) + flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } - xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size, - tsize, pfmemalloc); + xdp_update_skb_frags_info(skb, nr_frags, xinfo->xdp_frags_size, tsize, + flags); return true; } @@ -823,10 +823,9 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, skb_metadata_set(skb, xdpf->metasize); if (unlikely(xdp_frame_has_frags(xdpf))) - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdpf->frame_sz, - xdp_frame_is_frag_pfmemalloc(xdpf)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + nr_frags * xdpf->frame_sz, + xdp_frame_get_skb_flags(xdpf)); /* Essential SKB info: protocol and skb->dev */ skb->protocol = eth_type_trans(skb, dev); -- cgit v1.2.3 From 6bffdc0f88f85cd15b261286be4dc3c62ddea7d3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 5 Sep 2025 15:15:39 -0700 Subject: net: xdp: handle frags with unreadable memory We don't expect frags with unreadable memory to be presented to XDP programs today, but the XDP helpers are designed to be usable whether XDP is enabled or not. Support handling frags with unreadable memory. Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250905221539.2930285-3-kuba@kernel.org Acked-by: Stanislav Fomichev Reviewed-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- include/net/xdp.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 976cfd2f113c..6fd294fa6841 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -76,6 +76,11 @@ enum xdp_buff_flags { XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under * pressure */ + /* frags have unreadable mem, this can't be true for real XDP packets, + * but drivers may use XDP helpers to construct Rx pkt state even when + * XDP program is not attached. + */ + XDP_FLAGS_FRAGS_UNREADABLE = BIT(2), }; struct xdp_buff { @@ -121,6 +126,11 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } +static __always_inline void xdp_buff_set_frag_unreadable(struct xdp_buff *xdp) +{ + xdp->flags |= XDP_FLAGS_FRAGS_UNREADABLE; +} + static __always_inline u32 xdp_buff_get_skb_flags(const struct xdp_buff *xdp) { return xdp->flags; @@ -270,6 +280,8 @@ static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, if (unlikely(netmem_is_pfmemalloc(netmem))) xdp_buff_set_frag_pfmemalloc(xdp); + if (unlikely(netmem_is_net_iov(netmem))) + xdp_buff_set_frag_unreadable(xdp); return true; } @@ -350,6 +362,7 @@ xdp_update_skb_frags_info(struct sk_buff *skb, u8 nr_frags, skb->data_len += size; skb->truesize += truesize; skb->pfmemalloc |= !!(xdp_flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + skb->unreadable |= !!(xdp_flags & XDP_FLAGS_FRAGS_UNREADABLE); } /* Avoids inlining WARN macro in fast-path */ -- cgit v1.2.3 From 0d3c4a441686663ad34aa3d6abe8c5317d21e707 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 8 Sep 2025 10:32:32 +0300 Subject: ipv4: icmp: Pass IPv4 control block structure as an argument to __icmp_send() __icmp_send() is used to generate ICMP error messages in response to various situations such as MTU errors (i.e., "Fragmentation Required") and too many hops (i.e., "Time Exceeded"). The skb that generated the error does not necessarily come from the IPv4 layer and does not always have a valid IPv4 control block in skb->cb. Therefore, commit 9ef6b42ad6fd ("net: Add __icmp_send helper.") changed the function to take the IP options structure as argument instead of deriving it from the skb's control block. Some callers of this function such as icmp_send() pass the IP options structure from the skb's control block as in these call paths the control block is known to be valid, but other callers simply pass a zeroed structure. A subsequent patch will need __icmp_send() to access more information from the IPv4 control block (specifically, the ifindex of the input interface). As a preparation for this change, change the function to take the IPv4 control block structure as an argument instead of the IP options structure. This makes the function similar to its IPv6 counterpart that already takes the IPv6 control block structure as an argument. No functional changes intended. Reviewed-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250908073238.119240-3-idosch@nvidia.com Signed-off-by: Paolo Abeni --- include/net/icmp.h | 10 ++++++---- net/ipv4/cipso_ipv4.c | 12 ++++++------ net/ipv4/icmp.c | 12 +++++++----- net/ipv4/route.c | 10 +++++----- 4 files changed, 24 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/icmp.h b/include/net/icmp.h index caddf4a59ad1..935ee13d9ae9 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -37,10 +37,10 @@ struct sk_buff; struct net; void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, - const struct ip_options *opt); + const struct inet_skb_parm *parm); static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) { - __icmp_send(skb_in, type, code, info, &IPCB(skb_in)->opt); + __icmp_send(skb_in, type, code, info, IPCB(skb_in)); } #if IS_ENABLED(CONFIG_NF_NAT) @@ -48,8 +48,10 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info); #else static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { - struct ip_options opts = { 0 }; - __icmp_send(skb_in, type, code, info, &opts); + struct inet_skb_parm parm; + + memset(&parm, 0, sizeof(parm)); + __icmp_send(skb_in, type, code, info, &parm); } #endif diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index c7c949c37e2d..709021197e1c 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1715,7 +1715,7 @@ validate_return: */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { - struct ip_options opt; + struct inet_skb_parm parm; int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) @@ -1726,19 +1726,19 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) * so we can not use icmp_send and IPCB here. */ - memset(&opt, 0, sizeof(opt)); - opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); + memset(&parm, 0, sizeof(parm)); + parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); - res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL); + res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL); rcu_read_unlock(); if (res) return; if (gateway) - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &parm); else - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm); } /** diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 863bf5023f2a..59fd0e1993a6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -594,7 +594,7 @@ relookup_failed: */ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, - const struct ip_options *opt) + const struct inet_skb_parm *parm) { struct iphdr *iph; int room; @@ -725,7 +725,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt)) + if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, + &parm->opt)) goto out_unlock; @@ -799,15 +800,16 @@ EXPORT_SYMBOL(__icmp_send); void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct sk_buff *cloned_skb = NULL; - struct ip_options opts = { 0 }; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; + struct inet_skb_parm parm; struct nf_conn *ct; __be32 orig_ip; + memset(&parm, 0, sizeof(parm)); ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { - __icmp_send(skb_in, type, code, info, &opts); + __icmp_send(skb_in, type, code, info, &parm); return; } @@ -823,7 +825,7 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) orig_ip = ip_hdr(skb_in)->saddr; dir = CTINFO2DIR(ctinfo); ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip; - __icmp_send(skb_in, type, code, info, &opts); + __icmp_send(skb_in, type, code, info, &parm); ip_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 50309f2ab132..6d27d3610c1c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1222,8 +1222,8 @@ EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { + struct inet_skb_parm parm; struct net_device *dev; - struct ip_options opt; int res; /* Recompile ip options since IPCB may not be valid anymore. @@ -1233,21 +1233,21 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb) ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) return; - memset(&opt, 0, sizeof(opt)); + memset(&parm, 0, sizeof(parm)); if (ip_hdr(skb)->ihl > 5) { if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) return; - opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); + parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; - res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); + res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL); rcu_read_unlock(); if (res) return; } - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm); } static void ipv4_link_failure(struct sk_buff *skb) -- cgit v1.2.3 From 9e472d9e84b11e9f3c429eba97c2a9e74461a884 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 9 Sep 2025 02:18:50 +0100 Subject: tcp: Destroy TCP-AO, TCP-MD5 keys in .sk_destruct() Currently there are a couple of minor issues with destroying the keys tcp_v4_destroy_sock(): 1. The socket is yet in TCP bind buckets, making it reachable for incoming segments [on another CPU core], potentially available to send late FIN/ACK/RST replies. 2. There is at least one code path, where tcp_done() is called before sending RST [kudos to Bob for investigation]. This is a case of a server, that finished sending its data and just called close(). The socket is in TCP_FIN_WAIT2 and has RCV_SHUTDOWN (set by __tcp_close()) tcp_v4_do_rcv()/tcp_v6_do_rcv() tcp_rcv_state_process() /* LINUX_MIB_TCPABORTONDATA */ tcp_reset() tcp_done_with_error() tcp_done() inet_csk_destroy_sock() /* Destroys AO/MD5 keys */ /* tcp_rcv_state_process() returns SKB_DROP_REASON_TCP_ABORT_ON_DATA */ tcp_v4_send_reset() /* Sends an unsigned RST segment */ tcpdump: > 22:53:15.399377 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 33929, offset 0, flags [DF], proto TCP (6), length 60) > 1.0.0.1.34567 > 1.0.0.2.49848: Flags [F.], seq 2185658590, ack 3969644355, win 502, options [nop,nop,md5 valid], length 0 > 22:53:15.399396 00:00:01:01:00:00 > 00:00:b2:1f:00:00, ethertype IPv4 (0x0800), length 86: (tos 0x0, ttl 64, id 51951, offset 0, flags [DF], proto TCP (6), length 72) > 1.0.0.2.49848 > 1.0.0.1.34567: Flags [.], seq 3969644375, ack 2185658591, win 128, options [nop,nop,md5 valid,nop,nop,sack 1 {2185658590:2185658591}], length 0 > 22:53:16.429588 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 60: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 40) > 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658590, win 0, length 0 > 22:53:16.664725 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60) > 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658591, win 0, options [nop,nop,md5 valid], length 0 > 22:53:17.289832 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60) > 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658591, win 0, options [nop,nop,md5 valid], length 0 Note the signed RSTs later in the dump - those are sent by the server when the fin-wait socket gets removed from hash buckets, by the listener socket. Instead of destroying AO/MD5 info and their keys in inet_csk_destroy_sock(), slightly delay it until the actual socket .sk_destruct(). As shutdown'ed socket can yet send non-data replies, they should be signed in order for the peer to process them. Now it also matches how AO/MD5 gets destructed for TIME-WAIT sockets (in tcp_twsk_destructor()). This seems optimal for TCP-MD5, while for TCP-AO it seems to have an open problem: once RST get sent and socket gets actually destructed, there is no information on the initial sequence numbers. So, in case this last RST gets lost in the network, the server's listener socket won't be able to properly sign another RST. Nothing in RFC 1122 prescribes keeping any local state after non-graceful reset. Luckily, BGP are known to use keep alive(s). While the issue is quite minor/cosmetic, these days monitoring network counters is a common practice and getting invalid signed segments from a trusted BGP peer can get customers worried. Investigated-by: Bob Gilligan Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Signed-off-by: Dmitry Safonov Link: https://patch.msgid.link/20250909-b4-tcp-ao-md5-rst-finwait2-v5-1-9ffaaaf8b236@arista.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 4 ++++ net/ipv4/tcp.c | 27 +++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 33 ++++++++------------------------- net/ipv6/tcp_ipv6.c | 8 ++++++++ 4 files changed, 47 insertions(+), 25 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0fb7923b8367..277914c4d067 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1941,6 +1941,7 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk, } #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) +void tcp_md5_destruct_sock(struct sock *sk); #else static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, @@ -1957,6 +1958,9 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk, } #define tcp_twsk_md5_key(twsk) NULL +static inline void tcp_md5_destruct_sock(struct sock *sk) +{ +} #endif int tcp_md5_alloc_sigpool(void); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9c576dc9a1f7..7c6c143017ef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -412,6 +412,33 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) return rate64; } +#ifdef CONFIG_TCP_MD5SIG +static void tcp_md5sig_info_free_rcu(struct rcu_head *head) +{ + struct tcp_md5sig_info *md5sig; + + md5sig = container_of(head, struct tcp_md5sig_info, rcu); + kfree(md5sig); + static_branch_slow_dec_deferred(&tcp_md5_needed); + tcp_md5_release_sigpool(); +} + +void tcp_md5_destruct_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->md5sig_info) { + struct tcp_md5sig_info *md5sig; + + md5sig = rcu_dereference_protected(tp->md5sig_info, 1); + tcp_clear_md5_list(sk); + rcu_assign_pointer(tp->md5sig_info, NULL); + call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); + } +} +EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock); +#endif + /* Address-family independent initialization for a tcp_sock. * * NOTE: A lot of things set to zero explicitly by call to diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1e58a8a9ff7a..17176a5d8638 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2494,6 +2494,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; + +static void tcp4_destruct_sock(struct sock *sk) +{ + tcp_md5_destruct_sock(sk); + tcp_ao_destroy_sock(sk, false); + inet_sock_destruct(sk); +} #endif /* NOTE: A lot of things set to zero explicitly by call to @@ -2509,23 +2516,12 @@ static int tcp_v4_init_sock(struct sock *sk) #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; + sk->sk_destruct = tcp4_destruct_sock; #endif return 0; } -#ifdef CONFIG_TCP_MD5SIG -static void tcp_md5sig_info_free_rcu(struct rcu_head *head) -{ - struct tcp_md5sig_info *md5sig; - - md5sig = container_of(head, struct tcp_md5sig_info, rcu); - kfree(md5sig); - static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); -} -#endif - static void tcp_release_user_frags(struct sock *sk) { #ifdef CONFIG_PAGE_POOL @@ -2562,19 +2558,6 @@ void tcp_v4_destroy_sock(struct sock *sk) /* Cleans up our, hopefully empty, out_of_order_queue. */ skb_rbtree_purge(&tp->out_of_order_queue); -#ifdef CONFIG_TCP_MD5SIG - /* Clean up the MD5 key list, if any */ - if (tp->md5sig_info) { - struct tcp_md5sig_info *md5sig; - - md5sig = rcu_dereference_protected(tp->md5sig_info, 1); - tcp_clear_md5_list(sk); - call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); - rcu_assign_pointer(tp->md5sig_info, NULL); - } -#endif - tcp_ao_destroy_sock(sk, false); - /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0562e939b2e3..08dabc47a6e7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2110,6 +2110,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; + +static void tcp6_destruct_sock(struct sock *sk) +{ + tcp_md5_destruct_sock(sk); + tcp_ao_destroy_sock(sk, false); + inet6_sock_destruct(sk); +} #endif /* NOTE: A lot of things set to zero explicitly by call to @@ -2125,6 +2132,7 @@ static int tcp_v6_init_sock(struct sock *sk) #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; + sk->sk_destruct = tcp6_destruct_sock; #endif return 0; -- cgit v1.2.3 From 51e547e8c89c661f6fbede4a28b1d33b13625683 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 9 Sep 2025 02:18:51 +0100 Subject: tcp: Free TCP-AO/TCP-MD5 info/keys without RCU Now that the destruction of info/keys is delayed until the socket destructor, it's safe to use kfree() without an RCU callback. The socket is in TCP_CLOSE state either because it never left it, or it's already closed and the refcounter is zero. In any way, no one can discover it anymore, it's safe to release memory straight away. Similar thing was possible for twsk already. Reviewed-by: Kuniyuki Iwashima Signed-off-by: Dmitry Safonov Link: https://patch.msgid.link/20250909-b4-tcp-ao-md5-rst-finwait2-v5-2-9ffaaaf8b236@arista.com Signed-off-by: Jakub Kicinski --- include/net/tcp_ao.h | 1 - net/ipv4/tcp.c | 17 +++-------------- net/ipv4/tcp_ao.c | 5 ++--- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/tcp_minisocks.c | 19 +++++-------------- 5 files changed, 12 insertions(+), 34 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index df655ce6987d..1e9e27d6e06b 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -130,7 +130,6 @@ struct tcp_ao_info { u32 snd_sne; u32 rcv_sne; refcount_t refcnt; /* Protects twsk destruction */ - struct rcu_head rcu; }; #ifdef CONFIG_TCP_MD5SIG diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7c6c143017ef..7f9c671b1ee0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -413,27 +413,16 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) } #ifdef CONFIG_TCP_MD5SIG -static void tcp_md5sig_info_free_rcu(struct rcu_head *head) -{ - struct tcp_md5sig_info *md5sig; - - md5sig = container_of(head, struct tcp_md5sig_info, rcu); - kfree(md5sig); - static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); -} - void tcp_md5_destruct_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tp->md5sig_info) { - struct tcp_md5sig_info *md5sig; - md5sig = rcu_dereference_protected(tp->md5sig_info, 1); tcp_clear_md5_list(sk); - rcu_assign_pointer(tp->md5sig_info, NULL); - call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); + kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1)); + static_branch_slow_dec_deferred(&tcp_md5_needed); + tcp_md5_release_sigpool(); } } EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock); diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index bbb8d5f0eae7..31302be78bc4 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -268,9 +268,8 @@ static void tcp_ao_key_free_rcu(struct rcu_head *head) kfree_sensitive(key); } -static void tcp_ao_info_free_rcu(struct rcu_head *head) +static void tcp_ao_info_free(struct tcp_ao_info *ao) { - struct tcp_ao_info *ao = container_of(head, struct tcp_ao_info, rcu); struct tcp_ao_key *key; struct hlist_node *n; @@ -310,7 +309,7 @@ void tcp_ao_destroy_sock(struct sock *sk, bool twsk) if (!twsk) tcp_ao_sk_omem_free(sk, ao); - call_rcu(&ao->rcu, tcp_ao_info_free_rcu); + tcp_ao_info_free(ao); } void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 17176a5d8638..2a0602035729 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1503,9 +1503,9 @@ void tcp_clear_md5_list(struct sock *sk) md5sig = rcu_dereference_protected(tp->md5sig_info, 1); hlist_for_each_entry_safe(key, n, &md5sig->head, node) { - hlist_del_rcu(&key->node); + hlist_del(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); - kfree_rcu(key, rcu); + kfree(key); } } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index d1c9e4088646..7c2ae07d8d5d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -377,26 +377,17 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } EXPORT_SYMBOL(tcp_time_wait); -#ifdef CONFIG_TCP_MD5SIG -static void tcp_md5_twsk_free_rcu(struct rcu_head *head) -{ - struct tcp_md5sig_key *key; - - key = container_of(head, struct tcp_md5sig_key, rcu); - kfree(key); - static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); -} -#endif - void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); - if (twsk->tw_md5_key) - call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu); + if (twsk->tw_md5_key) { + kfree(twsk->tw_md5_key); + static_branch_slow_dec_deferred(&tcp_md5_needed); + tcp_md5_release_sigpool(); + } } #endif tcp_ao_destroy_sock(sk, true); -- cgit v1.2.3 From fdae0ab67d57d480dc61e9fb45678bbdc3786711 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Sep 2025 12:19:42 +0000 Subject: net: use NUMA drop counters for softnet_data.dropped Hosts under DOS attack can suffer from false sharing in enqueue_to_backlog() : atomic_inc(&sd->dropped). This is because sd->dropped can be touched from many cpus, possibly residing on different NUMA nodes. Generalize the sk_drop_counters infrastucture added in commit c51613fa276f ("net: add sk->sk_drop_counters") and use it to replace softnet_data.dropped with NUMA friendly softnet_data.drop_counters. This adds 64 bytes per cpu, maybe more in the future if we increase the number of counters (currently 2) per 'struct numa_drop_counters'. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250909121942.1202585-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 2 +- include/linux/netdevice.h | 28 +++++++++++++++++++++++++++- include/linux/udp.h | 2 +- include/net/raw.h | 2 +- include/net/sock.h | 37 ++++++++++++------------------------- net/core/dev.c | 2 +- net/core/net-procfs.c | 3 ++- 7 files changed, 45 insertions(+), 31 deletions(-) (limited to 'include/net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 261d02efb615..f43314517396 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -295,7 +295,7 @@ struct raw6_sock { __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; - struct socket_drop_counters drop_counters; + struct numa_drop_counters drop_counters; struct ipv6_pinfo inet6; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f3a3b761abfb..f5a840c07cf1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3459,6 +3459,32 @@ static inline bool dev_has_header(const struct net_device *dev) return dev->header_ops && dev->header_ops->create; } +struct numa_drop_counters { + atomic_t drops0 ____cacheline_aligned_in_smp; + atomic_t drops1 ____cacheline_aligned_in_smp; +}; + +static inline int numa_drop_read(const struct numa_drop_counters *ndc) +{ + return atomic_read(&ndc->drops0) + atomic_read(&ndc->drops1); +} + +static inline void numa_drop_add(struct numa_drop_counters *ndc, int val) +{ + int n = numa_node_id() % 2; + + if (n) + atomic_add(val, &ndc->drops1); + else + atomic_add(val, &ndc->drops0); +} + +static inline void numa_drop_reset(struct numa_drop_counters *ndc) +{ + atomic_set(&ndc->drops0, 0); + atomic_set(&ndc->drops1, 0); +} + /* * Incoming packets are placed on per-CPU queues */ @@ -3504,7 +3530,7 @@ struct softnet_data { struct sk_buff_head input_pkt_queue; struct napi_struct backlog; - atomic_t dropped ____cacheline_aligned_in_smp; + struct numa_drop_counters drop_counters; /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; diff --git a/include/linux/udp.h b/include/linux/udp.h index 981506be1e15..6ed008ab1665 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,7 +108,7 @@ struct udp_sock { * the last UDP socket cacheline. */ struct hlist_node tunnel_list; - struct socket_drop_counters drop_counters; + struct numa_drop_counters drop_counters; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/raw.h b/include/net/raw.h index d52709139060..66c0ffeada2e 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -81,7 +81,7 @@ struct raw_sock { struct inet_sock inet; struct icmp_filter filter; u32 ipmr_table; - struct socket_drop_counters drop_counters; + struct numa_drop_counters drop_counters; }; #define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk) diff --git a/include/net/sock.h b/include/net/sock.h index 896bec2d2176..0fd465935334 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -102,11 +102,6 @@ struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; -struct socket_drop_counters { - atomic_t drops0 ____cacheline_aligned_in_smp; - atomic_t drops1 ____cacheline_aligned_in_smp; -}; - /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr @@ -287,7 +282,7 @@ struct sk_filter; * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter - * @sk_drop_counters: optional pointer to socket_drop_counters + * @sk_drop_counters: optional pointer to numa_drop_counters * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner @@ -456,7 +451,7 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif - struct socket_drop_counters *sk_drop_counters; + struct numa_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -2698,18 +2693,12 @@ struct sock_skb_cb { static inline void sk_drops_add(struct sock *sk, int segs) { - struct socket_drop_counters *sdc = sk->sk_drop_counters; + struct numa_drop_counters *ndc = sk->sk_drop_counters; - if (sdc) { - int n = numa_node_id() % 2; - - if (n) - atomic_add(segs, &sdc->drops1); - else - atomic_add(segs, &sdc->drops0); - } else { + if (ndc) + numa_drop_add(ndc, segs); + else atomic_add(segs, &sk->sk_drops); - } } static inline void sk_drops_inc(struct sock *sk) @@ -2719,23 +2708,21 @@ static inline void sk_drops_inc(struct sock *sk) static inline int sk_drops_read(const struct sock *sk) { - const struct socket_drop_counters *sdc = sk->sk_drop_counters; + const struct numa_drop_counters *ndc = sk->sk_drop_counters; - if (sdc) { + if (ndc) { DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops)); - return atomic_read(&sdc->drops0) + atomic_read(&sdc->drops1); + return numa_drop_read(ndc); } return atomic_read(&sk->sk_drops); } static inline void sk_drops_reset(struct sock *sk) { - struct socket_drop_counters *sdc = sk->sk_drop_counters; + struct numa_drop_counters *ndc = sk->sk_drop_counters; - if (sdc) { - atomic_set(&sdc->drops0, 0); - atomic_set(&sdc->drops1, 0); - } + if (ndc) + numa_drop_reset(ndc); atomic_set(&sk->sk_drops, 0); } diff --git a/net/core/dev.c b/net/core/dev.c index 1d1650d9ecff..2522d9d8f0e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5248,7 +5248,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, backlog_unlock_irq_restore(sd, &flags); cpu_backlog_drop: - atomic_inc(&sd->dropped); + numa_drop_add(&sd->drop_counters, 1); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 4f0f0709a1cb..70e0e9a3b650 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -145,7 +145,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - READ_ONCE(sd->processed), atomic_read(&sd->dropped), + READ_ONCE(sd->processed), + numa_drop_read(&sd->drop_counters), READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ -- cgit v1.2.3 From 66048f8b3cc7e462953c04285183cdee43a1cb89 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 2 Sep 2025 14:29:33 +0800 Subject: net/cls_cgroup: Fix task_get_classid() during qdisc run During recent testing with the netem qdisc to inject delays into TCP traffic, we observed that our CLS BPF program failed to function correctly due to incorrect classid retrieval from task_get_classid(). The issue manifests in the following call stack: bpf_get_cgroup_classid+5 cls_bpf_classify+507 __tcf_classify+90 tcf_classify+217 __dev_queue_xmit+798 bond_dev_queue_xmit+43 __bond_start_xmit+211 bond_start_xmit+70 dev_hard_start_xmit+142 sch_direct_xmit+161 __qdisc_run+102 <<<<< Issue location __dev_xmit_skb+1015 __dev_queue_xmit+637 neigh_hh_output+159 ip_finish_output2+461 __ip_finish_output+183 ip_finish_output+41 ip_output+120 ip_local_out+94 __ip_queue_xmit+394 ip_queue_xmit+21 __tcp_transmit_skb+2169 tcp_write_xmit+959 __tcp_push_pending_frames+55 tcp_push+264 tcp_sendmsg_locked+661 tcp_sendmsg+45 inet_sendmsg+67 sock_sendmsg+98 sock_write_iter+147 vfs_write+786 ksys_write+181 __x64_sys_write+25 do_syscall_64+56 entry_SYSCALL_64_after_hwframe+100 The problem occurs when multiple tasks share a single qdisc. In such cases, __qdisc_run() may transmit skbs created by different tasks. Consequently, task_get_classid() retrieves an incorrect classid since it references the current task's context rather than the skb's originating task. Given that dev_queue_xmit() always executes with bh disabled, we can use softirq_count() instead to obtain the correct classid. The simple steps to reproduce this issue: 1. Add network delay to the network interface: such as: tc qdisc add dev bond0 root netem delay 1.5ms 2. Build two distinct net_cls cgroups, each with a network-intensive task 3. Initiate parallel TCP streams from both tasks to external servers. Under this specific condition, the issue reliably occurs. The kernel eventually dequeues an SKB that originated from Task-A while executing in the context of Task-B. It is worth noting that it will change the established behavior for a slightly different scenario: prior to this patch the skb will be classified with the 'new' task A classid, now with the old/original one. The bpf_get_cgroup_classid_curr() function is a more appropriate choice for this case. Signed-off-by: Yafang Shao Cc: Daniel Borkmann Cc: Thomas Graf Cc: Sebastian Andrzej Siewior Cc: Nikolay Aleksandrov Link: https://patch.msgid.link/20250902062933.30087-1-laoar.shao@gmail.com Signed-off-by: Jakub Kicinski --- include/net/cls_cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index 7e78e7d6f015..668aeee9b3f6 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -63,7 +63,7 @@ static inline u32 task_get_classid(const struct sk_buff *skb) * calls by looking at the number of nested bh disable calls because * softirqs always disables bh. */ - if (in_serving_softirq()) { + if (softirq_count()) { struct sock *sk = skb_to_full_sk(skb); /* If there is an sock_cgroup_classid we'll use that. */ -- cgit v1.2.3 From a9888628cb2c768202a4530e2816da1889cc3165 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Sep 2025 18:54:15 +0200 Subject: net: dst_metadata: fix IP_DF bit not extracted from tunnel headers Both OVS and TC flower allow extracting and matching on the DF bit of the outer IP header via OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT in the OVS_KEY_ATTR_TUNNEL and TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT in the TCA_FLOWER_KEY_ENC_FLAGS respectively. Flow dissector extracts this information as FLOW_DIS_F_TUNNEL_DONT_FRAGMENT from the tunnel info key. However, the IP_TUNNEL_DONT_FRAGMENT_BIT in the tunnel key is never actually set, because the tunneling code doesn't actually extract it from the IP header. OAM and CRIT_OPT are extracted by the tunnel implementation code, same code also sets the KEY flag, if present. UDP tunnel core takes care of setting the CSUM flag if the checksum is present in the UDP header, but the DONT_FRAGMENT is not handled at any layer. Fix that by checking the bit and setting the corresponding flag while populating the tunnel info in the IP layer where it belongs. Not using __assign_bit as we don't really need to clear the bit in a just initialized field. It also doesn't seem like using __assign_bit will make the code look better. Clearly, users didn't rely on this functionality for anything very important until now. The reason why this doesn't break OVS logic is that it only matches on what kernel previously parsed out and if kernel consistently reports this bit as zero, OVS will only match on it to be zero, which sort of works. But it is still a bug that the uAPI reports and allows matching on the field that is not actually checked in the packet. And this is causing misleading -df reporting in OVS datapath flows, while the tunnel traffic actually has the bit set in most cases. This may also cause issues if a hardware properly implements support for tunnel flag matching as it will disagree with the implementation in a software path of TC flower. Fixes: 7d5437c709de ("openvswitch: Add tunneling interface.") Fixes: 1d17568e74de ("net/sched: cls_flower: add support for matching tunnel control flags") Signed-off-by: Ilya Maximets Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250909165440.229890-2-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- include/net/dst_metadata.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 4160731dcb6e..1fc2fb03ce3f 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -3,6 +3,7 @@ #define __NET_DST_METADATA_H 1 #include +#include #include #include #include @@ -220,9 +221,15 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, int md_size) { const struct iphdr *iph = ip_hdr(skb); + struct metadata_dst *tun_dst; + + tun_dst = __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, + 0, flags, tunnel_id, md_size); - return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, - 0, flags, tunnel_id, md_size); + if (tun_dst && (iph->frag_off & htons(IP_DF))) + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + tun_dst->u.tun_info.key.tun_flags); + return tun_dst; } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, -- cgit v1.2.3 From 61b2f7baa9779b12a7bf1b9800a3f2a2549a1315 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 11 Sep 2025 13:06:30 +0200 Subject: tcp: fast path functions later MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following patch will use tcp_ecn_mode_accecn(), TCP_ACCECN_CEP_INIT_OFFSET, TCP_ACCECN_CEP_ACE_MASK in __tcp_fast_path_on() to make new flag for AccECN. No functional changes. Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911110642.87529-3-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 277914c4d067..e25340459ce4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -821,33 +821,6 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp) return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } -static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) -{ - /* mptcp hooks are only on the slow path */ - if (sk_is_mptcp((struct sock *)tp)) - return; - - tp->pred_flags = htonl((tp->tcp_header_len << 26) | - ntohl(TCP_FLAG_ACK) | - snd_wnd); -} - -static inline void tcp_fast_path_on(struct tcp_sock *tp) -{ - __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); -} - -static inline void tcp_fast_path_check(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && - tp->rcv_wnd && - atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && - !tp->urg_data) - tcp_fast_path_on(tp); -} - u32 tcp_delack_max(const struct sock *sk); /* Compute the actual rto_min value */ @@ -1807,6 +1780,33 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, return true; } +static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) +{ + /* mptcp hooks are only on the slow path */ + if (sk_is_mptcp((struct sock *)tp)) + return; + + tp->pred_flags = htonl((tp->tcp_header_len << 26) | + ntohl(TCP_FLAG_ACK) | + snd_wnd); +} + +static inline void tcp_fast_path_on(struct tcp_sock *tp) +{ + __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); +} + +static inline void tcp_fast_path_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && + tp->rcv_wnd && + atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && + !tp->urg_data) + tcp_fast_path_on(tp); +} + bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time); -- cgit v1.2.3 From 30f5ca00624397d81c99515bdd43286ade93d7c8 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Thu, 11 Sep 2025 13:06:32 +0200 Subject: tcp: ecn functions in separated include file The following patches will modify ECN helpers and add AccECN herlpers, and this patch moves the existing ones into a separated include file. No functional changes. Signed-off-by: Chia-Yu Chang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911110642.87529-5-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/net/tcp_ecn.h | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 45 +------------------- net/ipv4/tcp_output.c | 56 +----------------------- 3 files changed, 118 insertions(+), 99 deletions(-) create mode 100644 include/net/tcp_ecn.h (limited to 'include/net') diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h new file mode 100644 index 000000000000..b3430557676b --- /dev/null +++ b/include/net/tcp_ecn.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _TCP_ECN_H +#define _TCP_ECN_H + +#include +#include + +#include +#include +#include +#include + +static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) +{ + if (tcp_ecn_mode_rfc3168(tp)) + tp->ecn_flags |= TCP_ECN_QUEUE_CWR; +} + +static inline void tcp_ecn_accept_cwr(struct sock *sk, + const struct sk_buff *skb) +{ + if (tcp_hdr(skb)->cwr) { + tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + + /* If the sender is telling us it has entered CWR, then its + * cwnd may be very low (even just 1 packet), so we should ACK + * immediately. + */ + if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + } +} + +static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) +{ + tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; +} + +static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp, + const struct tcphdr *th) +{ + if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); +} + +static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, + const struct tcphdr *th) +{ + if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); +} + +static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, + const struct tcphdr *th) +{ + if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) + return true; + return false; +} + +/* Packet ECN state for a SYN-ACK */ +static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; + if (tcp_ecn_disabled(tp)) + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; + else if (tcp_ca_needs_ecn(sk) || + tcp_bpf_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); +} + +/* Packet ECN state for a SYN. */ +static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; + + if (!use_ecn) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } + + tp->ecn_flags = 0; + + if (use_ecn) { + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); + + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + } +} + +static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) +{ + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) + /* tp->ecn_flags are cleared at a later point in time when + * SYN ACK is ultimatively being received. + */ + TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); +} + +static inline void +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) +{ + if (inet_rsk(req)->ecn_ok) + th->ece = 1; +} + +#endif /* _LINUX_TCP_ECN_H */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f1be65af1a77..b2793e749cfd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -339,31 +340,6 @@ static bool tcp_in_quickack_mode(struct sock *sk) (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } -static void tcp_ecn_queue_cwr(struct tcp_sock *tp) -{ - if (tcp_ecn_mode_rfc3168(tp)) - tp->ecn_flags |= TCP_ECN_QUEUE_CWR; -} - -static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) -{ - if (tcp_hdr(skb)->cwr) { - tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; - - /* If the sender is telling us it has entered CWR, then its - * cwnd may be very low (even just 1 packet), so we should ACK - * immediately. - */ - if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; - } -} - -static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) -{ - tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; -} - static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -399,25 +375,6 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) } } -static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) -{ - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); -} - -static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) -{ - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); -} - -static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) -{ - if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) - return true; - return false; -} - static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 54b8faa3ad95..be8ceefa5332 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -38,6 +38,7 @@ #define pr_fmt(fmt) "TCP: " fmt #include +#include #include #include @@ -319,61 +320,6 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } -/* Packet ECN state for a SYN-ACK */ -static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; - if (tcp_ecn_disabled(tp)) - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; - else if (tcp_ca_needs_ecn(sk) || - tcp_bpf_ca_needs_ecn(sk)) - INET_ECN_xmit(sk); -} - -/* Packet ECN state for a SYN. */ -static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); - bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || - tcp_ca_needs_ecn(sk) || bpf_needs_ecn; - - if (!use_ecn) { - const struct dst_entry *dst = __sk_dst_get(sk); - - if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) - use_ecn = true; - } - - tp->ecn_flags = 0; - - if (use_ecn) { - if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) - INET_ECN_xmit(sk); - - TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; - tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); - } -} - -static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) -{ - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) - /* tp->ecn_flags are cleared at a later point in time when - * SYN ACK is ultimatively being received. - */ - TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); -} - -static void -tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) -{ - if (inet_rsk(req)->ecn_ok) - th->ece = 1; -} - /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ -- cgit v1.2.3 From 6bdcb735fec6cb866b0d40634d4f23effba81074 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 16 Sep 2025 17:11:43 +0300 Subject: devlink: Add a 'num_doorbells' driverinit param This parameter can be used by drivers to configure a different number of doorbells. Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Reviewed-by: Jiri Pirko Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-params.rst | 3 +++ include/net/devlink.h | 4 ++++ net/devlink/param.c | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'include/net') diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst index c51da4fba7e7..0a9c20d70122 100644 --- a/Documentation/networking/devlink/devlink-params.rst +++ b/Documentation/networking/devlink/devlink-params.rst @@ -148,3 +148,6 @@ own name. - The max number of Virtual Functions (VFs) exposed by the PF. after reboot/pci reset, 'sriov_totalvfs' entry under the device's sysfs directory will report this value. + * - ``num_doorbells`` + - u32 + - Controls the number of doorbells used by the device. diff --git a/include/net/devlink.h b/include/net/devlink.h index 8d4362f010e4..9e824f61e40f 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -531,6 +531,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, + DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -598,6 +599,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME "total_vfs" #define DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE DEVLINK_PARAM_TYPE_U32 +#define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME "num_doorbells" +#define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/devlink/param.c b/net/devlink/param.c index 33134940c266..70e69523412c 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -107,6 +107,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME, .type = DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, + .name = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME, + .type = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From 542a495cbaa6dc57a310da62b501fdf318657cad Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:25 +0200 Subject: tcp: AccECN core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change implements Accurate ECN without negotiation and AccECN Option (that will be added by later changes). Based on AccECN specifications: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt Accurate ECN allows feeding back the number of CE (congestion experienced) marks accurately to the sender in contrast to RFC3168 ECN that can only signal one marks-seen-yes/no per RTT. Congestion control algorithms can take advantage of the accurate ECN information to fine-tune their congestion response to avoid drastic rate reduction when only mild congestion is encountered. With Accurate ECN, tp->received_ce (r.cep in AccECN spec) keeps track of how many segments have arrived with a CE mark. Accurate ECN uses ACE field (ECE, CWR, AE) to communicate the value back to the sender which updates tp->delivered_ce (s.cep) based on the feedback. This signalling channel is lossy when ACE field overflow occurs. Conservative strategy is selected here to deal with the ACE overflow, however, some strategies using the AccECN option later in the overall patchset mitigate against false overflows detected. The ACE field values on the wire are offset by TCP_ACCECN_CEP_INIT_OFFSET. Delivered_ce/received_ce count the real CE marks rather than forcing all downstream users to adapt to the wire offset. This patch uses the first 1-byte hole and the last 4-byte hole of the tcp_sock_write_txrx for 'received_ce_pending' and 'received_ce'. Also, the group size of tcp_sock_write_txrx is increased from 91 + 4 to 95 + 4 due to the new u32 received_ce member. Below are the trimmed pahole outcomes before and after this patch. [BEFORE THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* XXX 2 bytes hole, try to pack */ [...] u32 delivered_ce; /* 2576 4 */ u32 app_limited; /* 2580 4 */ u32 rcv_wnd; /* 2684 4 */ struct tcp_options_received rx_opt; /* 2688 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2612 0 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 161 */ } [AFTER THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ /* XXX 1 byte hole, try to pack */ [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 app_limited; /* 2584 4 */ u32 rcv_wnd; /* 2588 4 */ struct tcp_options_received rx_opt; /* 2592 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2616 0 */ [...] /* size: 3200, cachelines: 50, members: 164 */ } Signed-off-by: Ilpo Järvinen Co-developed-by: Olivier Tilmans Signed-off-by: Olivier Tilmans Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-2-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- .../networking/net_cachelines/tcp_sock.rst | 2 + include/linux/tcp.h | 3 + include/net/tcp.h | 15 ++++ include/net/tcp_ecn.h | 53 ++++++++++- net/ipv4/tcp.c | 5 +- net/ipv4/tcp_input.c | 100 +++++++++++++++++++-- net/ipv4/tcp_output.c | 9 +- 7 files changed, 175 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 7bbda5944ee2..31313a9adccc 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -101,6 +101,8 @@ u32 prr_delivered u32 prr_out read_mostly read_mostly tcp_rate_skb_sent,tcp_newly_delivered(tx);tcp_ack,tcp_rate_gen,tcp_clean_rtx_queue(rx) u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) +u32 received_ce read_mostly read_write +u8:4 received_ce_pending read_mostly read_write u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u64 first_tx_mstamp read_write tcp_rate_skb_sent diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d103cc0e7a35..90cee6e53527 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -287,6 +287,8 @@ struct tcp_sock { */ u8 nonagle : 4,/* Disable Nagle algorithm? */ rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ + u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ + unused2:4; __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -299,6 +301,7 @@ struct tcp_sock { u32 snd_up; /* Urgent pointer */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ + u32 received_ce; /* Like the above but for rcvd CE marked pkts */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* diff --git a/include/net/tcp.h b/include/net/tcp.h index e25340459ce4..bc5159fe842e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -973,6 +973,14 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +#define TCP_ACCECN_CEP_ACE_MASK 0x7 +#define TCP_ACCECN_ACE_MAX_DELTA 6 + +/* To avoid/detect middlebox interference, not all counters start at 0. + * See draft-ietf-tcpm-accurate-ecn for the latest values. + */ +#define TCP_ACCECN_CEP_INIT_OFFSET 5 + /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ @@ -1782,11 +1790,18 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { + u32 ace; + /* mptcp hooks are only on the slow path */ if (sk_is_mptcp((struct sock *)tp)) return; + ace = tcp_ecn_mode_accecn(tp) ? + ((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) & + TCP_ACCECN_CEP_ACE_MASK) : 0; + tp->pred_flags = htonl((tp->tcp_header_len << 26) | + (ace << 22) | ntohl(TCP_FLAG_ACK) | snd_wnd); } diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index b3430557676b..b0ed89dbad41 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -12,6 +12,7 @@ static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { + /* Do not set CWR if in AccECN mode! */ if (tcp_ecn_mode_rfc3168(tp)) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } @@ -19,8 +20,10 @@ static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) static inline void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) { - if (tcp_hdr(skb)->cwr) { - tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) { + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; /* If the sender is telling us it has entered CWR, then its * cwnd may be very low (even just 1 packet), so we should ACK @@ -36,6 +39,52 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } +static inline u8 tcp_accecn_ace(const struct tcphdr *th) +{ + return (th->ae << 2) | (th->cwr << 1) | th->ece; +} + +static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +{ + tp->received_ce = 0; + tp->received_ce_pending = 0; +} + +/* Updates Accurate ECN received counters from the received IP ECN field */ +static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) +{ + u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + u8 is_ce = INET_ECN_is_ce(ecnfield); + struct tcp_sock *tp = tcp_sk(sk); + + if (!INET_ECN_is_not_ect(ecnfield)) { + u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); + + /* As for accurate ECN, the TCP_ECN_SEEN flag is set by + * tcp_ecn_received_counters() when the ECN codepoint of + * received TCP data or ACK contains ECT(0), ECT(1), or CE. + */ + if (!tcp_ecn_mode_rfc3168(tp)) + tp->ecn_flags |= TCP_ECN_SEEN; + + /* ACE counter tracks *all* segments including pure ACKs */ + tp->received_ce += pcount; + tp->received_ce_pending = min(tp->received_ce_pending + pcount, + 0xfU); + } +} + +static inline void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp) +{ + u32 wire_ace; + + wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; + th->ece = !!(wire_ace & 0x1); + th->cwr = !!(wire_ace & 0x2); + th->ae = !!(wire_ace & 0x4); + tp->received_ce_pending = 0; +} + static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1f643faa8b93..16456c10e5e8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,6 +271,7 @@ #include #include #include +#include #include #include #include @@ -3406,6 +3407,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + tcp_accecn_init_counters(tp); if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -5138,6 +5140,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); @@ -5145,7 +5148,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 91 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 95 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b2793e749cfd..98782134c2f4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -360,16 +360,25 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); - if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && + tcp_ecn_mode_rfc3168(tp)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } + /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by + * tcp_data_ecn_check() when the ECN codepoint of + * received TCP data contains ECT(0), ECT(1), or CE. + */ + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; default: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; } @@ -385,10 +394,64 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, bool ece_ack) { tp->delivered += delivered; - if (ece_ack) + if (tcp_ecn_mode_rfc3168(tp) && ece_ack) tcp_count_delivered_ce(tp, delivered); } +/* Returns the ECN CE delta */ +static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, int flag) +{ + const struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + u32 delta, safe_delta; + u32 corrected_ace; + + /* Reordered ACK or uncertain due to lack of data to send and ts */ + if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) + return 0; + + if (!(flag & FLAG_SLOWPATH)) { + /* AccECN counter might overflow on large ACKs */ + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return 0; + } + + /* ACE field is not available during handshake */ + if (flag & FLAG_SYN_ACKED) + return 0; + + if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + + corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET; + delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK; + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return delta; + + safe_delta = delivered_pkts - + ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); + + return safe_delta; +} + +static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, int *flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 delta; + + delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); + if (delta > 0) { + tcp_count_delivered_ce(tp, delta); + *flag |= FLAG_ECE; + /* Recalculate header predictor */ + if (tp->pred_flags) + tcp_fast_path_on(tp); + } + return delta; +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -3744,7 +3807,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) } /* Returns the number of packets newly acked or sacked by the current ACK */ -static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) +static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, + u32 ecn_count, int flag) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -3752,8 +3816,12 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) - NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); + + if (flag & FLAG_ECE) { + if (tcp_ecn_mode_rfc3168(tp)) + ecn_count = delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count); + } return delivered; } @@ -3774,6 +3842,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 delivered = tp->delivered; u32 lost = tp->lost; int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? */ u32 prior_fack; sack_state.first_sackt = 0; @@ -3881,6 +3950,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + &flag); + tcp_in_ack_event(sk, flag); if (tp->tlp_high_seq) @@ -3905,7 +3979,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - delivered = tcp_newly_delivered(sk, delivered, flag); + delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag); + lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); @@ -3914,12 +3989,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); } /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than @@ -3940,7 +4019,7 @@ old_ack: &sack_state); tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); tcp_xmit_recovery(sk, rexmit); } @@ -6071,6 +6150,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta); + tcp_ecn_received_counters(sk, skb); + /* We know that such packets are checksummed * on entry. */ @@ -6119,6 +6200,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); + tcp_ecn_received_counters(sk, skb); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); @@ -6159,6 +6241,8 @@ validate: return; step5: + tcp_ecn_received_counters(sk, skb); + reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { reason = -reason; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index be8ceefa5332..a3a6d3e91d84 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -328,7 +328,14 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_ecn_mode_rfc3168(tp)) { + if (!tcp_ecn_mode_any(tp)) + return; + + if (tcp_ecn_mode_accecn(tp)) { + INET_ECN_xmit(sk); + tcp_accecn_set_ace(th, tp); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; + } else { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { -- cgit v1.2.3 From 3cae34274c79e0c60ccd1c10516973af1aed2a7c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:26 +0200 Subject: tcp: accecn: AccECN negotiation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Accurate ECN negotiation parts based on the specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt Accurate ECN is negotiated using ECE, CWR and AE flags in the TCP header. TCP falls back into using RFC3168 ECN if one of the ends supports only RFC3168-style ECN. The AccECN negotiation includes reflecting IP ECN field value seen in SYN and SYNACK back using the same bits as negotiation to allow responding to SYN CE marks and to detect ECN field mangling. CE marks should not occur currently because SYN=1 segments are sent with Non-ECT in IP ECN field (but proposal exists to remove this restriction). Reflecting SYN IP ECN field in SYNACK is relatively simple. Reflecting SYNACK IP ECN field in the final/third ACK of the handshake is more challenging. Linux TCP code is not well prepared for using the final/third ACK a signalling channel which makes things somewhat complicated here. tcp_ecn sysctl can be used to select the highest ECN variant (Accurate ECN, ECN, No ECN) that is attemped to be negotiated and requested for incoming connection and outgoing connection: TCP_ECN_IN_NOECN_OUT_NOECN, TCP_ECN_IN_ECN_OUT_ECN, TCP_ECN_IN_ECN_OUT_NOECN, TCP_ECN_IN_ACCECN_OUT_ACCECN, TCP_ECN_IN_ACCECN_OUT_ECN, and TCP_ECN_IN_ACCECN_OUT_NOECN. After this patch, the size of tcp_request_sock remains unchanged and no new holes are added. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_request_sock { [...] u32 rcv_nxt; /* 352 4 */ u8 syn_tos; /* 356 1 */ /* size: 360, cachelines: 6, members: 16 */ } [AFTER THIS PATCH] struct tcp_request_sock { [...] u32 rcv_nxt; /* 352 4 */ u8 syn_tos; /* 356 1 */ bool accecn_ok; /* 357 1 */ u8 syn_ect_snt:2; /* 358: 0 1 */ u8 syn_ect_rcv:2; /* 358: 2 1 */ u8 accecn_fail_mode:4; /* 358: 4 1 */ /* size: 360, cachelines: 6, members: 20 */ } After this patch, the size of tcp_sock remains unchanged and no new holes are added. Also, 4 bits of the existing 2-byte hole are exploited. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] u8 dup_ack_counter:2; /* 2761: 0 1 */ u8 tlp_retrans:1; /* 2761: 2 1 */ u8 unused:5; /* 2761: 3 1 */ u8 thin_lto:1; /* 2762: 0 1 */ u8 fastopen_connect:1; /* 2762: 1 1 */ u8 fastopen_no_cookie:1; /* 2762: 2 1 */ u8 fastopen_client_fail:2; /* 2762: 3 1 */ u8 frto:1; /* 2762: 5 1 */ /* XXX 2 bits hole, try to pack */ [...] u8 keepalive_probes; /* 2765 1 */ /* XXX 2 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 164 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u8 dup_ack_counter:2; /* 2761: 0 1 */ u8 tlp_retrans:1; /* 2761: 2 1 */ u8 syn_ect_snt:2; /* 2761: 3 1 */ u8 syn_ect_rcv:2; /* 2761: 5 1 */ u8 thin_lto:1; /* 2761: 7 1 */ u8 fastopen_connect:1; /* 2762: 0 1 */ u8 fastopen_no_cookie:1; /* 2762: 1 1 */ u8 fastopen_client_fail:2; /* 2762: 2 1 */ u8 frto:1; /* 2762: 4 1 */ /* XXX 3 bits hole, try to pack */ [...] u8 keepalive_probes; /* 2765 1 */ u8 accecn_fail_mode:4; /* 2766: 0 1 */ /* XXX 4 bits hole, try to pack */ /* XXX 1 byte hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 166 */ } Signed-off-by: Ilpo Järvinen Co-developed-by: Olivier Tilmans Signed-off-by: Olivier Tilmans Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-3-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 36 ++- .../networking/net_cachelines/tcp_sock.rst | 3 + include/linux/tcp.h | 8 +- include/net/tcp.h | 1 + include/net/tcp_ecn.h | 310 +++++++++++++++++++-- net/ipv4/syncookies.c | 4 + net/ipv4/sysctl_net_ipv4.c | 3 +- net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 50 +++- net/ipv4/tcp_ipv4.c | 6 +- net/ipv4/tcp_minisocks.c | 24 +- net/ipv4/tcp_output.c | 10 +- net/ipv6/syncookies.c | 2 + net/ipv6/tcp_ipv6.c | 1 + 14 files changed, 400 insertions(+), 59 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 9f5891c9b07b..3d8eb54b84f9 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -443,20 +443,28 @@ tcp_early_retrans - INTEGER tcp_ecn - INTEGER Control use of Explicit Congestion Notification (ECN) by TCP. - ECN is used only when both ends of the TCP connection indicate - support for it. This feature is useful in avoiding losses due - to congestion by allowing supporting routers to signal - congestion before having to drop packets. - - Possible values are: - - = ===================================================== - 0 Disable ECN. Neither initiate nor accept ECN. - 1 Enable ECN when requested by incoming connections and - also request ECN on outgoing connection attempts. - 2 Enable ECN when requested by incoming connections - but do not request ECN on outgoing connections. - = ===================================================== + ECN is used only when both ends of the TCP connection indicate support + for it. This feature is useful in avoiding losses due to congestion by + allowing supporting routers to signal congestion before having to drop + packets. A host that supports ECN both sends ECN at the IP layer and + feeds back ECN at the TCP layer. The highest variant of ECN feedback + that both peers support is chosen by the ECN negotiation (Accurate ECN, + ECN, or no ECN). + + The highest negotiated variant for incoming connection requests + and the highest variant requested by outgoing connection + attempts: + + ===== ==================== ==================== + Value Incoming connections Outgoing connections + ===== ==================== ==================== + 0 No ECN No ECN + 1 ECN ECN + 2 ECN No ECN + 3 AccECN AccECN + 4 AccECN ECN + 5 AccECN No ECN + ===== ==================== ==================== Default: 2 diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 31313a9adccc..4f71ece7c655 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -103,6 +103,9 @@ u32 delivered read_mostly read_w u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u32 received_ce read_mostly read_write u8:4 received_ce_pending read_mostly read_write +u8:2 syn_ect_snt write_mostly read_write +u8:2 syn_ect_rcv read_mostly read_write +u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u64 first_tx_mstamp read_write tcp_rate_skb_sent diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 90cee6e53527..b8432bed546d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -168,6 +168,10 @@ struct tcp_request_sock { * after data-in-SYN. */ u8 syn_tos; + bool accecn_ok; + u8 syn_ect_snt: 2, + syn_ect_rcv: 2, + accecn_fail_mode:4; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; @@ -375,7 +379,8 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + syn_ect_snt:2, /* AccECN ECT memory, only */ + syn_ect_rcv:2; /* ... needed during 3WHS + first seqno */ u8 thin_lto : 1,/* Use linear timeouts for thin streams */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ @@ -391,6 +396,7 @@ struct tcp_sock { syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ + u8 accecn_fail_mode:4; /* AccECN failure handling */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ diff --git a/include/net/tcp.h b/include/net/tcp.h index bc5159fe842e..da8c6640ead3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -972,6 +972,7 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +#define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR) #define TCP_ACCECN_CEP_ACE_MASK 0x7 #define TCP_ACCECN_ACE_MAX_DELTA 6 diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index b0ed89dbad41..da0b355418bd 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -4,12 +4,26 @@ #include #include +#include #include #include #include #include +/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is + * attemped to be negotiated and requested for incoming connection + * and outgoing connection, respectively. + */ +enum tcp_ecn_mode { + TCP_ECN_IN_NOECN_OUT_NOECN = 0, + TCP_ECN_IN_ECN_OUT_ECN = 1, + TCP_ECN_IN_ECN_OUT_NOECN = 2, + TCP_ECN_IN_ACCECN_OUT_ACCECN = 3, + TCP_ECN_IN_ACCECN_OUT_ECN = 4, + TCP_ECN_IN_ACCECN_OUT_NOECN = 5, +}; + static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { /* Do not set CWR if in AccECN mode! */ @@ -39,19 +53,125 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } +/* tp->accecn_fail_mode */ +#define TCP_ACCECN_ACE_FAIL_SEND BIT(0) +#define TCP_ACCECN_ACE_FAIL_RECV BIT(1) +#define TCP_ACCECN_OPT_FAIL_SEND BIT(2) +#define TCP_ACCECN_OPT_FAIL_RECV BIT(3) + +static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND; +} + +static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV; +} + +static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND; +} + +static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV; +} + +static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) +{ + tp->accecn_fail_mode |= mode; +} + static inline u8 tcp_accecn_ace(const struct tcphdr *th) { return (th->ae << 2) | (th->cwr << 1) | th->ece; } -static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +/* Infer the ECT value our SYN arrived with from the echoed ACE field */ +static inline int tcp_accecn_extract_syn_ect(u8 ace) { - tp->received_ce = 0; - tp->received_ce_pending = 0; + /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */ + static const int ace_to_ecn[8] = { + INET_ECN_ECT_0, /* 0b000 (Undefined) */ + INET_ECN_ECT_1, /* 0b001 (Undefined) */ + INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */ + INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */ + INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */ + INET_ECN_ECT_1, /* 0b101 (Reserved) */ + INET_ECN_CE, /* 0b110 (CE is received) */ + INET_ECN_ECT_1 /* 0b111 (Undefined) */ + }; + + return ace_to_ecn[ace & 0x7]; +} + +/* Check ECN field transition to detect invalid transitions */ +static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv) +{ + if (rcv == snt) + return true; + + /* Non-ECT altered to something or something became non-ECT */ + if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT) + return false; + /* CE -> ECT(0/1)? */ + if (snt == INET_ECN_CE) + return false; + return true; +} + +static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, + u8 sent_ect) +{ + u8 ect = tcp_accecn_extract_syn_ect(ace); + struct tcp_sock *tp = tcp_sk(sk); + + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) + return true; + + if (!tcp_ect_transition_valid(sent_ect, ect)) { + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); + return false; + } + + return true; +} + +/* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ +static inline void tcp_accecn_third_ack(struct sock *sk, + const struct sk_buff *skb, u8 sent_ect) +{ + u8 ace = tcp_accecn_ace(tcp_hdr(skb)); + struct tcp_sock *tp = tcp_sk(sk); + + switch (ace) { + case 0x0: + /* Invalid value */ + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); + break; + case 0x7: + case 0x5: + case 0x1: + /* Unused but legal values */ + break; + default: + /* Validation only applies to first non-data packet */ + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && + !TCP_SKB_CB(skb)->sacked && + tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) { + if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) && + !tp->delivered_ce) + tp->delivered_ce++; + } + break; + } } /* Updates Accurate ECN received counters from the received IP ECN field */ -static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) +static inline void tcp_ecn_received_counters(struct sock *sk, + const struct sk_buff *skb) { u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); @@ -74,27 +194,152 @@ static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_bu } } -static inline void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp) +/* AccECN specification, 5.1: [...] a server can determine that it + * negotiated AccECN as [...] if the ACK contains an ACE field with + * the value 0b010 to 0b111 (decimal 2 to 7). + */ +static inline bool cookie_accecn_ok(const struct tcphdr *th) { - u32 wire_ace; + return tcp_accecn_ace(th) > 0x1; +} + +/* Used to form the ACE flags for SYN/ACK */ +static inline u16 tcp_accecn_reflector_flags(u8 ect) +{ + /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN. + * Below is an excerpt from the 1st block of Table 2 of AccECN spec, + * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE + */ + static const u8 ecn_to_ace_flags[4] = { + 0b010, /* Not-ECT is received */ + 0b011, /* ECT(1) is received */ + 0b100, /* ECT(0) is received */ + 0b110 /* CE is received */ + }; + + return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]); +} - wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; - th->ece = !!(wire_ace & 0x1); - th->cwr = !!(wire_ace & 0x2); - th->ae = !!(wire_ace & 0x4); +/* AccECN specification, 3.1.2: If a TCP server that implements AccECN + * receives a SYN with the three TCP header flags (AE, CWR and ECE) set + * to any combination other than 000, 011 or 111, it MUST negotiate the + * use of AccECN as if they had been set to 111. + */ +static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) +{ + u8 ace = tcp_accecn_ace(th); + + return ace && ace != 0x3; +} + +static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +{ + tp->received_ce = 0; tp->received_ce_pending = 0; } -static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp, - const struct tcphdr *th) +/* Used for make_synack to form the ACE flags */ +static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) { - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) + /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received + * from SYN. Below is an excerpt from Table 2 of the AccECN spec: + * +====================+====================================+ + * | IP-ECN codepoint | Respective ACE falgs on SYN/ACK | + * | received on SYN | AE CWR ECE | + * +====================+====================================+ + * | Not-ECT | 0 1 0 | + * | ECT(1) | 0 1 1 | + * | ECT(0) | 1 0 0 | + * | CE | 1 1 0 | + * +====================+====================================+ + */ + th->ae = !!(ect & INET_ECN_ECT_0); + th->cwr = ect != INET_ECN_ECT_0; + th->ece = ect == INET_ECN_ECT_1; +} + +static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, + struct tcphdr *th) +{ + u32 wire_ace; + + /* The final packet of the 3WHS or anything like it must reflect + * the SYN/ACK ECT instead of putting CEP into ACE field, such + * case show up in tcp_flags. + */ + if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) { + wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; + th->ece = !!(wire_ace & 0x1); + th->cwr = !!(wire_ace & 0x2); + th->ae = !!(wire_ace & 0x4); + tp->received_ce_pending = 0; + } +} + +/* See Table 2 of the AccECN draft */ +static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, + u8 ip_dsfield) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 ace = tcp_accecn_ace(th); + + switch (ace) { + case 0x0: + case 0x7: + /* +========+========+============+=============+ + * | A | B | SYN/ACK | Feedback | + * | | | B->A | Mode of A | + * | | | AE CWR ECE | | + * +========+========+============+=============+ + * | AccECN | No ECN | 0 0 0 | Not ECN | + * | AccECN | Broken | 1 1 1 | Not ECN | + * +========+========+============+=============+ + */ tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); + break; + case 0x1: + case 0x5: + /* +========+========+============+=============+ + * | A | B | SYN/ACK | Feedback | + * | | | B->A | Mode of A | + * | | | AE CWR ECE | | + * +========+========+============+=============+ + * | AccECN | Nonce | 1 0 1 | (Reserved) | + * | AccECN | ECN | 0 0 1 | Classic ECN | + * | Nonce | AccECN | 0 0 1 | Classic ECN | + * | ECN | AccECN | 0 0 1 | Classic ECN | + * +========+========+============+=============+ + */ + if (tcp_ecn_mode_pending(tp)) + /* Downgrade from AccECN, or requested initially */ + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + break; + default: + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + if (INET_ECN_is_ce(ip_dsfield) && + tcp_accecn_validate_syn_feedback(sk, ace, + tp->syn_ect_snt)) { + tp->received_ce++; + tp->received_ce_pending++; + } + break; + } } -static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, - const struct tcphdr *th) +static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, + const struct sk_buff *skb) { + if (tcp_ecn_mode_pending(tp)) { + if (!tcp_accecn_syn_requested(th)) { + /* Downgrade to classic ECN feedback */ + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + } else { + tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + } + } if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } @@ -110,7 +355,7 @@ static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, /* Packet ECN state for a SYN-ACK */ static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) { - const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (tcp_ecn_disabled(tp)) @@ -118,6 +363,13 @@ static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) else if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk)) INET_ECN_xmit(sk); + + if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) { + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; + TCP_SKB_CB(skb)->tcp_flags |= + tcp_accecn_reflector_flags(tp->syn_ect_rcv); + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } } /* Packet ECN state for a SYN. */ @@ -125,8 +377,13 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); - bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || - tcp_ca_needs_ecn(sk) || bpf_needs_ecn; + bool use_ecn, use_accecn; + u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn); + + use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN; + use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || + tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; if (!use_ecn) { const struct dst_entry *dst = __sk_dst_get(sk); @@ -142,23 +399,32 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) INET_ECN_xmit(sk); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; - tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + if (use_accecn) { + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING); + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } else { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + } } } static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) { - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) { /* tp->ecn_flags are cleared at a later point in time when * SYN ACK is ultimatively being received. */ - TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; + } } static inline void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { - if (inet_rsk(req)->ecn_ok) + if (tcp_rsk(req)->accecn_ok) + tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); + else if (inet_rsk(req)->ecn_ok) th->ece = 1; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index eb0819463fae..569befcf021b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -12,6 +12,7 @@ #include #include #include +#include #include static siphash_aligned_key_t syncookie_secret[2]; @@ -403,6 +404,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); + struct tcp_request_sock *treq; struct request_sock *req; struct sock *ret = sk; struct flowi4 fl4; @@ -428,6 +430,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } ireq = inet_rsk(req); + treq = tcp_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); @@ -483,6 +486,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); + treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3a43010d726f..268f8b86e8a7 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -47,6 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; +static int tcp_ecn_mode_max = 2; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -728,7 +729,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, + .extra2 = &tcp_ecn_mode_max, }, { .procname = "tcp_ecn_fallback", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 16456c10e5e8..7261ee6dd875 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3407,6 +3407,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + tp->accecn_fail_mode = 0; tcp_accecn_init_counters(tp); if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 98782134c2f4..8449a5a3e368 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3665,8 +3665,18 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } +static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector) +{ + struct tcp_sock *tp = tcp_sk(sk); + u16 flags = 0; + + if (accecn_reflector) + flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv); + __tcp_send_ack(sk, tp->rcv_nxt, flags); +} + /* RFC 5961 7 [ACK Throttling] */ -static void tcp_send_challenge_ack(struct sock *sk) +static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -3696,7 +3706,7 @@ static void tcp_send_challenge_ack(struct sock *sk) WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1); send_ack: NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, accecn_reflector); } } @@ -3863,7 +3873,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); return -SKB_DROP_REASON_TCP_TOO_OLD_ACK; } goto old_ack; @@ -5907,6 +5917,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { struct tcp_sock *tp = tcp_sk(sk); + bool accecn_reflector = false; SKB_DR(reason); /* RFC1323: H1. Apply PAWS check first. */ @@ -6004,7 +6015,7 @@ step1: if (tp->syn_fastopen && !tp->data_segs_in && sk->sk_state == TCP_ESTABLISHED) tcp_fastopen_active_disable(sk); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); SKB_DR_SET(reason, TCP_RESET); goto discard; } @@ -6015,6 +6026,8 @@ step1: * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { + if (tcp_ecn_mode_accecn(tp)) + accecn_reflector = true; if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && @@ -6024,7 +6037,7 @@ syn_challenge: if (syn_inerr) TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, accecn_reflector); SKB_DR_SET(reason, TCP_INVALID_SYN); goto discard; } @@ -6493,7 +6506,8 @@ consume: * state to ESTABLISHED..." */ - tcp_ecn_rcv_synack(tp, th); + if (tcp_ecn_mode_any(tp)) + tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -6565,7 +6579,7 @@ consume: TCP_DELACK_MAX, false); goto consume; } - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp)); return -1; } @@ -6624,7 +6638,7 @@ consume: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - tcp_ecn_rcv_syn(tp, th); + tcp_ecn_rcv_syn(tp, th, skb); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -6806,7 +6820,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } /* accept old ack during closing */ if ((int)reason < 0) { - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); reason = -reason; goto discard; } @@ -6853,9 +6867,12 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); + if (tcp_ecn_mode_accecn(tp)) + tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); tcp_fast_path_on(tp); if (sk->sk_shutdown & SEND_SHUTDOWN) tcp_shutdown(sk, SEND_SHUTDOWN); + break; case TCP_FIN_WAIT1: { @@ -7025,6 +7042,15 @@ static void tcp_ecn_create_request(struct request_sock *req, bool ect, ecn_ok; u32 ecn_ok_dst; + if (tcp_accecn_syn_requested(th) && + READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) { + inet_rsk(req)->ecn_ok = 1; + tcp_rsk(req)->accecn_ok = 1; + tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + return; + } + if (!th_ecn) return; @@ -7032,7 +7058,8 @@ static void tcp_ecn_create_request(struct request_sock *req, ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst; - if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + if (((!ect || th->res1 || th->ae) && ecn_ok) || + tcp_ca_needs_ecn(listen_sk) || (ecn_ok_dst & DST_FEATURE_ECN_CA) || tcp_bpf_ca_needs_ecn((struct sock *)req)) inet_rsk(req)->ecn_ok = 1; @@ -7050,6 +7077,9 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; + tcp_rsk(req)->accecn_ok = 0; + tcp_rsk(req)->syn_ect_rcv = 0; + tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2a0602035729..6162f8dbe9d2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include @@ -1189,7 +1190,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { - const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; struct sk_buff *skb; @@ -1202,6 +1203,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { + tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); tos = READ_ONCE(inet_sk(sk)->tos); @@ -3558,7 +3560,7 @@ fallback: static int __net_init tcp_sk_init(struct net *net) { - net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7c2ae07d8d5d..a4b8be6fdcdc 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -20,6 +20,7 @@ */ #include +#include #include #include #include @@ -451,12 +452,23 @@ void tcp_openreq_init_rwin(struct request_sock *req, ireq->rcv_wscale = rcv_wscale; } -static void tcp_ecn_openreq_child(struct tcp_sock *tp, - const struct request_sock *req) +static void tcp_ecn_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) { - tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? - TCP_ECN_MODE_RFC3168 : - TCP_ECN_DISABLED); + const struct tcp_request_sock *treq = tcp_rsk(req); + struct tcp_sock *tp = tcp_sk(sk); + + if (treq->accecn_ok) { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_snt = treq->syn_ect_snt; + tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tcp_ecn_received_counters(sk, skb); + } else { + tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? + TCP_ECN_MODE_RFC3168 : + TCP_ECN_DISABLED); + } } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) @@ -621,7 +633,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; - tcp_ecn_openreq_child(newtp, req); + tcp_ecn_openreq_child(newsk, req, skb); newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a3a6d3e91d84..deb9b085a8a2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -332,8 +332,9 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, return; if (tcp_ecn_mode_accecn(tp)) { - INET_ECN_xmit(sk); - tcp_accecn_set_ace(th, tp); + if (!tcp_accecn_ace_fail_recv(tp)) + INET_ECN_xmit(sk); + tcp_accecn_set_ace(tp, skb, th); skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; } else { /* Not-retransmitted data segment: set ECT and inject CWR. */ @@ -3356,7 +3357,10 @@ start: tcp_retrans_try_collapse(sk, skb, avail_wnd); } - /* RFC3168, section 6.1.1.1. ECN fallback */ + /* RFC3168, section 6.1.1.1. ECN fallback + * As AccECN uses the same SYN flags (+ AE), this check covers both + * cases. + */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index f0ee1a909771..7e007f013ec8 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -16,6 +16,7 @@ #include #include #include +#include #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -264,6 +265,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, dst); + tcp_rsk(req)->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, dst); if (!ret) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 08dabc47a6e7..5f0a138f4220 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -544,6 +544,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { + tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK; __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); -- cgit v1.2.3 From 9a011277445583bab002fbf5043fab0ea03dc5dd Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:27 +0200 Subject: tcp: accecn: add AccECN rx byte counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These three byte counters track IP ECN field payload byte sums for all arriving (acceptable) packets for ECT0, ECT1, and CE. The AccECN option (added by a later patch in the series) echoes these counters back to sender side; therefore, it is placed within the group of tcp_sock_write_txrx. Below are the pahole outcomes before and after this patch, in which the group size of tcp_sock_write_txrx is increased from 95 + 4 to 107 + 4 and an extra 4-byte hole is created but will be exploited in later patches: [BEFORE THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 app_limited; /* 2584 4 */ u32 rcv_wnd; /* 2588 4 */ struct tcp_options_received rx_opt; /* 2592 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2616 0 */ [...] /* size: 3200, cachelines: 50, members: 166 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 received_ecn_bytes[3];/* 2584 12 */ u32 app_limited; /* 2596 4 */ u32 rcv_wnd; /* 2600 4 */ struct tcp_options_received rx_opt; /* 2604 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 167 */ } Signed-off-by: Ilpo Järvinen Signed-off-by: Neal Cardwell Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-4-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- .../networking/net_cachelines/tcp_sock.rst | 1 + include/linux/tcp.h | 4 +++ include/net/tcp_ecn.h | 29 +++++++++++++++++++++- net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_input.c | 7 +++--- net/ipv4/tcp_minisocks.c | 2 +- 6 files changed, 40 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 4f71ece7c655..5a2b0af57364 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -102,6 +102,7 @@ u32 prr_out read_mostly read_m u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u32 received_ce read_mostly read_write +u32[3] received_ecn_bytes read_mostly read_write u8:4 received_ce_pending read_mostly read_write u8:2 syn_ect_snt write_mostly read_write u8:2 syn_ect_rcv read_mostly read_write diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b8432bed546d..012d01347b3c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -306,6 +306,10 @@ struct tcp_sock { u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 received_ce; /* Like the above but for rcvd CE marked pkts */ + u32 received_ecn_bytes[3]; /* received byte counters for three ECN + * types: INET_ECN_ECT_1, INET_ECN_ECT_0, + * and INET_ECN_CE + */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index da0b355418bd..1a41a459aa07 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -171,7 +171,7 @@ static inline void tcp_accecn_third_ack(struct sock *sk, /* Updates Accurate ECN received counters from the received IP ECN field */ static inline void tcp_ecn_received_counters(struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, u32 len) { u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); @@ -191,9 +191,24 @@ static inline void tcp_ecn_received_counters(struct sock *sk, tp->received_ce += pcount; tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); + + if (len > 0) + tp->received_ecn_bytes[ecnfield - 1] += len; } } +/* AccECN specification, 2.2: [...] A Data Receiver maintains four counters + * initialized at the start of the half-connection. [...] These byte counters + * reflect only the TCP payload length, excluding TCP header and TCP options. + */ +static inline void tcp_ecn_received_counters_payload(struct sock *sk, + const struct sk_buff *skb) +{ + const struct tcphdr *th = (const struct tcphdr *)skb->data; + + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); +} + /* AccECN specification, 5.1: [...] a server can determine that it * negotiated AccECN as [...] if the ACK contains an ACE field with * the value 0b010 to 0b111 (decimal 2 to 7). @@ -232,10 +247,22 @@ static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) return ace && ace != 0x3; } +static inline void __tcp_accecn_init_bytes_counters(int *counter_array) +{ + BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1); + BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2); + BUILD_BUG_ON(INET_ECN_CE != 0x3); + + counter_array[INET_ECN_ECT_1 - 1] = 0; + counter_array[INET_ECN_ECT_0 - 1] = 0; + counter_array[INET_ECN_CE - 1] = 0; +} + static inline void tcp_accecn_init_counters(struct tcp_sock *tp) { tp->received_ce = 0; tp->received_ce_pending = 0; + __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); } /* Used for make_synack to form the ACE flags */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7261ee6dd875..a45a4184b603 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5142,6 +5142,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); @@ -5149,7 +5150,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 95 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 107 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8449a5a3e368..636a63383412 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6163,7 +6163,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, 0); /* We know that such packets are checksummed * on entry. @@ -6213,7 +6213,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, + len - tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); @@ -6254,7 +6255,7 @@ validate: return; step5: - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters_payload(sk, skb); reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a4b8be6fdcdc..1dbcc09ff7a9 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,7 +463,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters_payload(sk, skb); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? TCP_ECN_MODE_RFC3168 : -- cgit v1.2.3 From b5e74132dfbe60329b3ff0e5c485039f2e31605c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:30 +0200 Subject: tcp: accecn: AccECN option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Accurate ECN allows echoing back the sum of bytes for each IP ECN field value in the received packets using AccECN option. This change implements AccECN option tx & rx side processing without option send control related features that are added by a later change. Based on specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt (Some features of the spec will be added in the later changes rather than in this one). A full-length AccECN option is always attempted but if it does not fit, the minimum length is selected based on the counters that have changed since the last update. The AccECN option (with 24-bit fields) often ends in odd sizes so the option write code tries to take advantage of some nop used to pad the other TCP options. The delivered_ecn_bytes pairs with received_ecn_bytes similar to how delivered_ce pairs with received_ce. In contrast to ACE field, however, the option is not always available to update delivered_ecn_bytes. For ACK w/o AccECN option, the delivered bytes calculated based on the cumulative ACK+SACK information are assigned to one of the counters using an estimation heuristic to select the most likely ECN byte counter. Any estimation error is corrected when the next AccECN option arrives. It may occur that the heuristic gets too confused when there are enough different byte counter deltas between ACKs with the AccECN option in which case the heuristic just gives up on updating the counters for a while. tcp_ecn_option sysctl can be used to select option sending mode for AccECN: TCP_ECN_OPTION_DISABLED, TCP_ECN_OPTION_MINIMUM, and TCP_ECN_OPTION_FULL. This patch increases the size of tcp_info struct, as there is no existing holes for new u32 variables. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_info { [...] __u32 tcpi_total_rto_time; /* 244 4 */ /* size: 248, cachelines: 4, members: 61 */ } [AFTER THIS PATCH] struct tcp_info { [...] __u32 tcpi_total_rto_time; /* 244 4 */ __u32 tcpi_received_ce; /* 248 4 */ __u32 tcpi_delivered_e1_bytes; /* 252 4 */ __u32 tcpi_delivered_e0_bytes; /* 256 4 */ __u32 tcpi_delivered_ce_bytes; /* 260 4 */ __u32 tcpi_received_e1_bytes; /* 264 4 */ __u32 tcpi_received_e0_bytes; /* 268 4 */ __u32 tcpi_received_ce_bytes; /* 272 4 */ /* size: 280, cachelines: 5, members: 68 */ } This patch uses the existing 1-byte holes in the tcp_sock_write_txrx group for new u8 members, but adds a 4-byte hole in tcp_sock_write_rx group after the new u32 delivered_ecn_bytes[3] member. Therefore, the group size of tcp_sock_write_rx is increased from 96 to 112. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4; /* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ /* XXX 1 byte hole, try to pack */ [...] u32 rcv_rtt_last_tsecr; /* 2668 4 */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2728 0 */ [...] /* size: 3200, cachelines: 50, members: 167 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ u8 accecn_minlen:2; /* 2523: 0 1 */ u8 est_ecnfield:2; /* 2523: 2 1 */ u8 unused3:4; /* 2523: 4 1 */ [...] u32 rcv_rtt_last_tsecr; /* 2668 4 */ u32 delivered_ecn_bytes[3];/* 2672 12 */ /* XXX 4 bytes hole, try to pack */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2744 0 */ [...] /* size: 3200, cachelines: 50, members: 171 */ } Signed-off-by: Ilpo Järvinen Signed-off-by: Neal Cardwell Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-7-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 19 +++ .../networking/net_cachelines/tcp_sock.rst | 3 + include/linux/tcp.h | 9 +- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 13 ++ include/net/tcp_ecn.h | 89 ++++++++++- include/uapi/linux/tcp.h | 7 + net/ipv4/sysctl_net_ipv4.c | 9 ++ net/ipv4/tcp.c | 15 +- net/ipv4/tcp_input.c | 94 +++++++++++- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 165 ++++++++++++++++++++- 12 files changed, 412 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 3d8eb54b84f9..1c206501b973 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -468,6 +468,25 @@ tcp_ecn - INTEGER Default: 2 +tcp_ecn_option - INTEGER + Control Accurate ECN (AccECN) option sending when AccECN has been + successfully negotiated during handshake. Send logic inhibits + sending AccECN options regarless of this setting when no AccECN + option has been seen for the reverse direction. + + Possible values are: + + = ============================================================ + 0 Never send AccECN option. This also disables sending AccECN + option in SYN/ACK during handshake. + 1 Send AccECN option sparingly according to the minimum option + rules outlined in draft-ietf-tcpm-accurate-ecn. + 2 Send AccECN option on every packet whenever it fits into TCP + option space. + = ============================================================ + + Default: 2 + tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall back to non-ECN. Currently, this knob implements the fallback diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 5a2b0af57364..b941151f8c0a 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -104,8 +104,11 @@ u32 delivered_ce read_mostly read_w u32 received_ce read_mostly read_write u32[3] received_ecn_bytes read_mostly read_write u8:4 received_ce_pending read_mostly read_write +u32[3] delivered_ecn_bytes read_write u8:2 syn_ect_snt write_mostly read_write u8:2 syn_ect_rcv read_mostly read_write +u8:2 accecn_minlen write_mostly read_write +u8:2 est_ecnfield read_write u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 012d01347b3c..73557656cb2d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -122,8 +122,9 @@ struct tcp_options_received { smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ - u8 saw_unknown:1, /* Received unknown option */ - unused:7; + u8 accecn:6, /* AccECN index in header, 0=no options */ + saw_unknown:1, /* Received unknown option */ + unused:1; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ @@ -293,6 +294,9 @@ struct tcp_sock { rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ unused2:4; + u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ + est_ecnfield:2,/* ECN field for AccECN delivered estimates */ + unused3:4; __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -337,6 +341,7 @@ struct tcp_sock { u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_rtt_last_tsecr; + u32 delivered_ecn_bytes[3]; u64 first_tx_mstamp; /* start of window send phase */ u64 delivered_mstamp; /* time we reached "delivered" */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 54a7d187f62a..acbb7dd497e1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -148,6 +148,7 @@ struct netns_ipv4 { struct local_ports ip_local_ports; u8 sysctl_tcp_ecn; + u8 sysctl_tcp_ecn_option; u8 sysctl_tcp_ecn_fallback; u8 sysctl_ip_default_ttl; diff --git a/include/net/tcp.h b/include/net/tcp.h index da8c6640ead3..6be29129465e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -213,6 +213,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ +#define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */ +#define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt @@ -230,6 +232,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 +#define TCPOLEN_ACCECN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 @@ -243,6 +246,13 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 +#define TCPOLEN_ACCECN_PERFIELD 3 + +/* Maximum number of byte counters in AccECN option + size */ +#define TCP_ACCECN_NUMFIELDS 3 +#define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ + TCPOLEN_ACCECN_PERFIELD * \ + TCP_ACCECN_NUMFIELDS) /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ @@ -981,6 +991,9 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) * See draft-ietf-tcpm-accurate-ecn for the latest values. */ #define TCP_ACCECN_CEP_INIT_OFFSET 5 +#define TCP_ACCECN_E1B_INIT_OFFSET 1 +#define TCP_ACCECN_E0B_INIT_OFFSET 1 +#define TCP_ACCECN_CEB_INIT_OFFSET 0 /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 1a41a459aa07..08c7f4757e4e 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -24,6 +24,13 @@ enum tcp_ecn_mode { TCP_ECN_IN_ACCECN_OUT_NOECN = 5, }; +/* AccECN option sending when AccECN has been successfully negotiated */ +enum tcp_accecn_option { + TCP_ACCECN_OPTION_DISABLED = 0, + TCP_ACCECN_OPTION_MINIMUM = 1, + TCP_ACCECN_OPTION_FULL = 2, +}; + static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { /* Do not set CWR if in AccECN mode! */ @@ -169,6 +176,79 @@ static inline void tcp_accecn_third_ack(struct sock *sk, } } +/* Maps IP ECN field ECT/CE code point to AccECN option field number, given + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). + */ +static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return 1; + case INET_ECN_CE: + return 2; + case INET_ECN_ECT_0: + return 3; + } + return 0; +} + +/* Maps IP ECN field ECT/CE code point to AccECN option field value offset. + * Some fields do not start from zero, to detect zeroing by middleboxes. + */ +static inline u32 tcp_accecn_field_init_offset(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return TCP_ACCECN_E1B_INIT_OFFSET; + case INET_ECN_CE: + return TCP_ACCECN_CEB_INIT_OFFSET; + case INET_ECN_ECT_0: + return TCP_ACCECN_E0B_INIT_OFFSET; + } + return 0; +} + +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ +static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option, + bool order) +{ + /* Based on Table 5 of the AccECN spec to map (option, order) to + * the corresponding ECN conuters (ECT-1, ECT-0, or CE). + */ + static const u8 optfield_lookup[2][3] = { + /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */ + { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 }, + /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */ + { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 } + }; + + return optfield_lookup[order][option % 3]; +} + +/* Handles AccECN option ECT and CE 24-bit byte counters update into + * the u32 value in tcp_sock. As we're processing TCP options, it is + * safe to access from - 1. + */ +static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, + u32 init_offset) +{ + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & + 0xFFFFFFU; + u32 delta = (truncated - *cnt) & 0xFFFFFFU; + + /* If delta has the highest bit set (24th bit) indicating + * negative, sign extend to correct an estimation using + * sign_extend32(delta, 24 - 1) + */ + delta = sign_extend32(delta, 23); + *cnt += delta; + return (s32)delta; +} + /* Updates Accurate ECN received counters from the received IP ECN field */ static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, u32 len) @@ -192,8 +272,12 @@ static inline void tcp_ecn_received_counters(struct sock *sk, tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); - if (len > 0) + if (len > 0) { + u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); tp->received_ecn_bytes[ecnfield - 1] += len; + tp->accecn_minlen = max_t(u8, tp->accecn_minlen, + minlen); + } } } @@ -263,6 +347,9 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) tp->received_ce = 0; tp->received_ce_pending = 0; __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); + __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); + tp->accecn_minlen = 0; + tp->est_ecnfield = 0; } /* Used for make_synack to form the ACE flags */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index bdac8c42fa82..53e0e85b52be 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -316,6 +316,13 @@ struct tcp_info { * in milliseconds, including any * unfinished recovery. */ + __u32 tcpi_received_ce; /* # of CE marks received */ + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ + __u32 tcpi_delivered_e0_bytes; + __u32 tcpi_delivered_ce_bytes; + __u32 tcpi_received_e1_bytes; + __u32 tcpi_received_e0_bytes; + __u32 tcpi_received_ce_bytes; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 268f8b86e8a7..4a697acb4e85 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -731,6 +731,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_ecn_mode_max, }, + { + .procname = "tcp_ecn_option", + .data = &init_net.ipv4.sysctl_tcp_ecn_option, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a45a4184b603..8c4a4b8666fc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include #include +#include #include #include #include @@ -4155,6 +4156,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ce_idx = INET_ECN_CE - 1; unsigned long rate; u32 now; u64 rate64; @@ -4281,6 +4285,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_received_ce = tp->received_ce; + info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; + info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; + info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx]; + info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx]; + info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx]; + info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx]; + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -5162,12 +5174,13 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 96); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 112); } void __init tcp_init(void) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8b48a3c00945..e898a76c485e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -384,6 +385,73 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) } } +/* Returns true if the byte counters can be used */ +static bool tcp_accecn_process_option(struct tcp_sock *tp, + const struct sk_buff *skb, + u32 delivered_bytes, int flag) +{ + u8 estimate_ecnfield = tp->est_ecnfield; + bool ambiguous_ecn_bytes_incr = false; + bool first_changed = false; + unsigned int optlen; + bool order1, res; + unsigned int i; + u8 *ptr; + + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (estimate_ecnfield) { + u8 ecnfield = estimate_ecnfield - 1; + + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; + return true; + } + return false; + } + + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; + optlen = ptr[1] - 2; + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return false; + order1 = (ptr[0] == TCPOPT_ACCECN1); + ptr += 2; + + res = !!estimate_ecnfield; + for (i = 0; i < 3; i++) { + u32 init_offset; + u8 ecnfield; + s32 delta; + u32 *cnt; + + if (optlen < TCPOLEN_ACCECN_PERFIELD) + break; + + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); + init_offset = tcp_accecn_field_init_offset(ecnfield); + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); + if (delta && delta < 0) { + res = false; + ambiguous_ecn_bytes_incr = true; + } + if (delta && ecnfield != estimate_ecnfield) { + if (!first_changed) { + tp->est_ecnfield = ecnfield; + first_changed = true; + } else { + res = false; + ambiguous_ecn_bytes_incr = true; + } + } + + optlen -= TCPOLEN_ACCECN_PERFIELD; + ptr += TCPOLEN_ACCECN_PERFIELD; + } + if (ambiguous_ecn_bytes_incr) + tp->est_ecnfield = 0; + + return res; +} + static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; @@ -400,7 +468,8 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, /* Returns the ECN CE delta */ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, - u32 delivered_pkts, int flag) + u32 delivered_pkts, u32 delivered_bytes, + int flag) { const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); @@ -411,6 +480,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) return 0; + tcp_accecn_process_option(tp, skb, delivered_bytes, flag); + if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) @@ -436,12 +507,14 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, } static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, - u32 delivered_pkts, int *flag) + u32 delivered_pkts, u32 delivered_bytes, + int *flag) { struct tcp_sock *tp = tcp_sk(sk); u32 delta; - delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); + delta = __tcp_accecn_process(sk, skb, delivered_pkts, + delivered_bytes, *flag); if (delta > 0) { tcp_count_delivered_ce(tp, delta); *flag |= FLAG_ECE; @@ -3973,6 +4046,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); @@ -4012,6 +4086,7 @@ no_queue: if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ @@ -4139,6 +4214,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->accecn = 0; opt_rx->saw_unknown = 0; while (length > 0) { @@ -4230,6 +4306,12 @@ void tcp_parse_options(const struct net *net, ptr, th->syn, foc, false); break; + case TCPOPT_ACCECN0: + case TCPOPT_ACCECN1: + /* Save offset of AccECN option in TCP header */ + opt_rx->accecn = (ptr - 2) - (__u8 *)th; + break; + case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. @@ -4290,11 +4372,14 @@ static bool tcp_fast_parse_options(const struct net *net, */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { - if (tcp_parse_aligned_timestamp(tp, th)) + if (tcp_parse_aligned_timestamp(tp, th)) { + tp->rx_opt.accecn = 0; return true; + } } tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); @@ -6119,6 +6204,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) */ tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6162f8dbe9d2..aa8dbfe20924 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3561,6 +3561,7 @@ fallback: static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; + net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5be2b3eb73d3..34e5c83bbace 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -385,6 +385,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) +#define OPTION_ACCECN BIT(12) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -406,6 +407,8 @@ struct tcp_out_options { u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ + u8 num_accecn_fields:7, /* number of AccECN fields needed */ + use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ @@ -603,6 +606,11 @@ static __be32 *process_tcp_ao_options(struct tcp_sock *tp, return ptr; } +/* Initial values for AccECN option, ordered is based on ECN field bits + * similar to received_ecn_bytes. Used for SYN/ACK AccECN option. + */ +static const u32 synack_ecn_bytes[3] = { 0, 0, 0 }; + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -621,6 +629,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, struct tcp_out_options *opts, struct tcp_key *key) { + u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */ + u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */ __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ @@ -656,15 +666,71 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } + if (OPTION_ACCECN & options) { + const u32 *ecn_bytes = opts->use_synack_ecn_bytes ? + synack_ecn_bytes : + tp->received_ecn_bytes; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ce_idx = INET_ECN_CE - 1; + u32 e0b; + u32 e1b; + u32 ceb; + u8 len; + + e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; + e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; + ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; + len = TCPOLEN_ACCECN_BASE + + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; + + if (opts->num_accecn_fields == 2) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + } else if (opts->num_accecn_fields == 1) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + leftover_highbyte = e1b & 0xff; + leftover_lowbyte = TCPOPT_NOP; + } else if (opts->num_accecn_fields == 0) { + leftover_highbyte = TCPOPT_ACCECN1; + leftover_lowbyte = len; + } else if (opts->num_accecn_fields == 3) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + *ptr++ = htonl(((e0b & 0xffffff) << 8) | + TCPOPT_NOP); + } + if (tp) + tp->accecn_minlen = 0; + } + if (unlikely(OPTION_SACK_ADVERTISE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_WSCALE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | + u8 highbyte = TCPOPT_NOP; + + /* Do not split the leftover 2-byte to fit into a single + * NOP, i.e., replace this NOP only when 1 byte is leftover + * within leftover_highbyte. + */ + if (unlikely(leftover_highbyte != TCPOPT_NOP && + leftover_lowbyte == TCPOPT_NOP)) { + highbyte = leftover_highbyte; + leftover_highbyte = TCPOPT_NOP; + } + *ptr++ = htonl((highbyte << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws); @@ -675,11 +741,13 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, tp->duplicate_sack : tp->selective_acks; int this_sack; - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { @@ -688,6 +756,14 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, } tp->rx_opt.dsack = 0; + } else if (unlikely(leftover_highbyte != TCPOPT_NOP || + leftover_lowbyte != TCPOPT_NOP)) { + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | + (TCPOPT_NOP << 8) | + TCPOPT_NOP); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { @@ -768,6 +844,61 @@ static void mptcp_set_option_cond(const struct request_sock *req, } } +static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts) +{ + /* How much there's room for combining with the alignment padding? */ + if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) == + OPTION_SACK_ADVERTISE) + return 2; + else if (opts->options & OPTION_WSCALE) + return 1; + return 0; +} + +/* Calculates how long AccECN option will fit to @remaining option space. + * + * AccECN option can sometimes replace NOPs used for alignment of other + * TCP options (up to @max_combine_saving available). + * + * Only solutions with at least @required AccECN fields are accepted. + * + * Returns: The size of the AccECN option excluding space repurposed from + * the alignment of the other options. + */ +static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required, + int remaining) +{ + int size = TCP_ACCECN_MAXSIZE; + int max_combine_saving; + int align_size; + + if (opts->use_synack_ecn_bytes) + max_combine_saving = tcp_synack_options_combine_saving(opts); + else + max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0; + opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; + while (opts->num_accecn_fields >= required) { + /* Pad to dword if cannot combine */ + if ((size & 0x3) > max_combine_saving) + align_size = ALIGN(size, 4); + else + align_size = ALIGN_DOWN(size, 4); + + if (remaining >= align_size) { + size = align_size; + break; + } + + opts->num_accecn_fields--; + size -= TCPOLEN_ACCECN_PERFIELD; + } + if (opts->num_accecn_fields < required) + return 0; + + opts->options |= OPTION_ACCECN; + return size; +} + /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ @@ -850,6 +981,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ + if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && + tcp_ecn_mode_accecn(tp) && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + remaining >= TCPOLEN_ACCECN_BASE)) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; @@ -867,6 +1007,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; + struct tcp_request_sock *treq = tcp_rsk(req); if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; @@ -929,6 +1070,13 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + if (treq->accecn_ok && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + remaining >= TCPOLEN_ACCECN_BASE) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining); @@ -1001,6 +1149,13 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks = 0; } + if (tcp_ecn_mode_accecn(tp) && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) { + opts->use_synack_ecn_bytes = 0; + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, + MAX_TCP_OPTION_SPACE - size); + } + if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; -- cgit v1.2.3 From aa55a7dde7ec506bb23448a5005ae3f4f809d022 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 16 Sep 2025 10:24:31 +0200 Subject: tcp: accecn: AccECN option send control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of sending the option in every ACK, limit sending to those ACKs where the option is necessary: - Handshake - "Change-triggered ACK" + the ACK following it. The 2nd ACK is necessary to unambiguously indicate which of the ECN byte counters in increasing. The first ACK has two counters increasing due to the ecnfield edge. - ACKs with CE to allow CEP delta validations to take advantage of the option. - Force option to be sent every at least once per 2^22 bytes. The check is done using the bit edges of the byte counters (avoids need for extra variables). - AccECN option beacon to send a few times per RTT even if nothing in the ECN state requires that. The default is 3 times per RTT, and its period can be set via sysctl_tcp_ecn_option_beacon. Below are the pahole outcomes before and after this patch, in which the group size of tcp_sock_write_tx is increased from 89 to 97 due to the new u64 accecn_opt_tstamp member: [BEFORE THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2488 8 */ struct list_head tsorted_sent_queue; /* 2496 16 */ [...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2521 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ u8 accecn_minlen:2; /* 2523: 0 1 */ u8 est_ecnfield:2; /* 2523: 2 1 */ u8 unused3:4; /* 2523: 4 1 */ [...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */ [...] /* size: 3200, cachelines: 50, members: 171 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2488 8 */ u64 accecn_opt_tstamp; /* 2596 8 */ struct list_head tsorted_sent_queue; /* 2504 16 */ [...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2529 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2529 0 */ u8 nonagle:4; /* 2529: 0 1 */ u8 rate_app_limited:1; /* 2529: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2530: 0 1 */ u8 unused2:4; /* 2530: 4 1 */ u8 accecn_minlen:2; /* 2531: 0 1 */ u8 est_ecnfield:2; /* 2531: 2 1 */ u8 accecn_opt_demand:2; /* 2531: 4 1 */ u8 prev_ecnfield:2; /* 2531: 6 1 */ [...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2636 0 */ [...] /* size: 3200, cachelines: 50, members: 173 */ } Signed-off-by: Chia-Yu Chang Co-developed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-8-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 6 +++ .../networking/net_cachelines/tcp_sock.rst | 3 ++ include/linux/tcp.h | 4 +- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 3 ++ include/net/tcp_ecn.h | 52 ++++++++++++++++++++++ net/ipv4/sysctl_net_ipv4.c | 9 ++++ net/ipv4/tcp.c | 5 ++- net/ipv4/tcp_input.c | 4 +- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 2 + net/ipv4/tcp_output.c | 26 ++++++++--- 12 files changed, 107 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 1c206501b973..a06cb99d66dc 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -487,6 +487,12 @@ tcp_ecn_option - INTEGER Default: 2 +tcp_ecn_option_beacon - INTEGER + Control Accurate ECN (AccECN) option sending frequency per RTT and it + takes effect only when tcp_ecn_option is set to 2. + + Default: 3 (AccECN will be send at least 3 times per RTT) + tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall back to non-ECN. Currently, this knob implements the fallback diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index b941151f8c0a..d4dc01800945 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -109,6 +109,9 @@ u8:2 syn_ect_snt write_mostly read_w u8:2 syn_ect_rcv read_mostly read_write u8:2 accecn_minlen write_mostly read_write u8:2 est_ecnfield read_write +u8:2 accecn_opt_demand read_mostly read_write +u8:2 prev_ecnfield read_write +u64 accecn_opt_tstamp read_write u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 73557656cb2d..f637b659b35a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -275,6 +275,7 @@ struct tcp_sock { u32 mdev_us; /* medium deviation */ u32 rtt_seq; /* sequence number to update rttvar */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ + u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set @@ -296,7 +297,8 @@ struct tcp_sock { unused2:4; u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ est_ecnfield:2,/* ECN field for AccECN delivered estimates */ - unused3:4; + accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */ + prev_ecnfield:2; /* ECN bits from the previous segment */ __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index acbb7dd497e1..34eb3aecb3f2 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -149,6 +149,7 @@ struct netns_ipv4 { u8 sysctl_tcp_ecn; u8 sysctl_tcp_ecn_option; + u8 sysctl_tcp_ecn_option_beacon; u8 sysctl_tcp_ecn_fallback; u8 sysctl_ip_default_ttl; diff --git a/include/net/tcp.h b/include/net/tcp.h index 6be29129465e..78dd7b8a4145 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -100,6 +100,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* Maximal number of window scale according to RFC1323 */ #define TCP_MAX_WSCALE 14U +/* Default sending frequency of accurate ECN option per RTT */ +#define TCP_ACCECN_OPTION_BEACON 3 + /* urg_data states */ #define TCP_URG_VALID 0x0100 #define TCP_URG_NOTYET 0x0200 diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 08c7f4757e4e..133fb6b79500 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -176,6 +176,17 @@ static inline void tcp_accecn_third_ack(struct sock *sk, } } +/* Demand the minimum # to send AccECN optnio */ +static inline void tcp_accecn_opt_demand_min(struct sock *sk, + u8 opt_demand_min) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 opt_demand; + + opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand); + tp->accecn_opt_demand = opt_demand; +} + /* Maps IP ECN field ECT/CE code point to AccECN option field number, given * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). */ @@ -256,6 +267,7 @@ static inline void tcp_ecn_received_counters(struct sock *sk, u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); struct tcp_sock *tp = tcp_sk(sk); + bool ecn_edge; if (!INET_ECN_is_not_ect(ecnfield)) { u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); @@ -274,9 +286,34 @@ static inline void tcp_ecn_received_counters(struct sock *sk, if (len > 0) { u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); + u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1]; + u32 bytes_mask = GENMASK_U32(31, 22); + tp->received_ecn_bytes[ecnfield - 1] += len; tp->accecn_minlen = max_t(u8, tp->accecn_minlen, minlen); + + /* Send AccECN option at least once per 2^22-byte + * increase in any ECN byte counter. + */ + if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) & + bytes_mask) { + tcp_accecn_opt_demand_min(sk, 1); + } + } + } + + ecn_edge = tp->prev_ecnfield != ecnfield; + if (ecn_edge || is_ce) { + tp->prev_ecnfield = ecnfield; + /* Demand Accurate ECN change-triggered ACKs. Two ACK are + * demanded to indicate unambiguously the ecnfield value + * in the latter ACK. + */ + if (tcp_ecn_mode_accecn(tp)) { + if (ecn_edge) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + tp->accecn_opt_demand = 2; } } } @@ -349,6 +386,7 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); tp->accecn_minlen = 0; + tp->accecn_opt_demand = 0; tp->est_ecnfield = 0; } @@ -431,6 +469,7 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + tp->accecn_opt_demand = 2; if (INET_ECN_is_ce(ip_dsfield) && tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt)) { @@ -451,6 +490,7 @@ static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, } else { tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + tp->prev_ecnfield = tp->syn_ect_rcv; tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); } } @@ -542,4 +582,16 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) th->ece = 1; } +static inline bool tcp_accecn_option_beacon_check(const struct sock *sk) +{ + u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon); + const struct tcp_sock *tp = tcp_sk(sk); + + if (!ecn_beacon) + return false; + + return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >= + (tp->srtt_us >> 3); +} + #endif /* _LINUX_TCP_ECN_H */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4a697acb4e85..24dbc603cc44 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -740,6 +740,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "tcp_ecn_option_beacon", + .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8c4a4b8666fc..090f9ac43d4c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3410,6 +3410,8 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered_ce = 0; tp->accecn_fail_mode = 0; tcp_accecn_init_counters(tp); + tp->prev_ecnfield = 0; + tp->accecn_opt_tstamp = 0; if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -5134,11 +5136,12 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 97); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e898a76c485e..87154fd86167 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6121,8 +6121,10 @@ step1: * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { - if (tcp_ecn_mode_accecn(tp)) + if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; + tcp_accecn_opt_demand_min(sk, 1); + } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index aa8dbfe20924..6a63be1f6461 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3562,6 +3562,7 @@ static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; + net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1dbcc09ff7a9..193343494558 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,6 +463,8 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 34e5c83bbace..f897c2594954 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -705,8 +705,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(((e0b & 0xffffff) << 8) | TCPOPT_NOP); } - if (tp) + if (tp) { tp->accecn_minlen = 0; + tp->accecn_opt_tstamp = tp->tcp_mstamp; + if (tp->accecn_opt_demand) + tp->accecn_opt_demand--; + } } if (unlikely(OPTION_SACK_ADVERTISE & options)) { @@ -1149,11 +1153,16 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks = 0; } - if (tcp_ecn_mode_accecn(tp) && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) { - opts->use_synack_ecn_bytes = 0; - size += tcp_options_fit_accecn(opts, tp->accecn_minlen, - MAX_TCP_OPTION_SPACE - size); + if (tcp_ecn_mode_accecn(tp)) { + int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); + + if (ecn_opt && + (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || + tcp_accecn_option_beacon_check(sk))) { + opts->use_synack_ecn_bytes = 0; + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, + MAX_TCP_OPTION_SPACE - size); + } } if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, @@ -2863,6 +2872,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, sent_pkts = 0; tcp_mstamp_refresh(tp); + + /* AccECN option beacon depends on mstamp, it may change mss */ + if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk)) + mss_now = tcp_current_mss(sk); + if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk); -- cgit v1.2.3 From b40671b5ee588c8a61b2d0eacbad32ffc57e9a8f Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 16 Sep 2025 10:24:32 +0200 Subject: tcp: accecn: AccECN option failure handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AccECN option may fail in various way, handle these: - Attempt to negotiate the use of AccECN on the 1st retransmitted SYN - From the 2nd retransmitted SYN, stop AccECN negotiation - Remove option from SYN/ACK rexmits to handle blackholes - If no option arrives in SYN/ACK, assume Option is not usable - If an option arrives later, re-enabled - If option is zeroed, disable AccECN option processing This patch use existing padding bits in tcp_request_sock and holes in tcp_sock without increasing the size. Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-9-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 4 +++- include/net/tcp_ecn.h | 51 +++++++++++++++++++++++++++++++++++++++++++++--- include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_input.c | 35 +++++++++++++++++++++++++++++++-- net/ipv4/tcp_minisocks.c | 14 +++++++++++++ net/ipv4/tcp_output.c | 11 ++++++++--- 7 files changed, 111 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f637b659b35a..3ca5ed02de6d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -173,6 +173,7 @@ struct tcp_request_sock { u8 syn_ect_snt: 2, syn_ect_rcv: 2, accecn_fail_mode:4; + u8 saw_accecn_opt :2; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; @@ -407,7 +408,8 @@ struct tcp_sock { syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ - u8 accecn_fail_mode:4; /* AccECN failure handling */ + u8 accecn_fail_mode:4, /* AccECN failure handling */ + saw_accecn_opt:2; /* An AccECN option was seen */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 133fb6b79500..f13e5cd2b1ac 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -91,6 +91,11 @@ static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) tp->accecn_fail_mode |= mode; } +#define TCP_ACCECN_OPT_NOT_SEEN 0x0 +#define TCP_ACCECN_OPT_EMPTY_SEEN 0x1 +#define TCP_ACCECN_OPT_COUNTER_SEEN 0x2 +#define TCP_ACCECN_OPT_FAIL_SEEN 0x3 + static inline u8 tcp_accecn_ace(const struct tcphdr *th) { return (th->ae << 2) | (th->cwr << 1) | th->ece; @@ -146,6 +151,14 @@ static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, return true; } +static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp, + u8 saw_opt) +{ + tp->saw_accecn_opt = saw_opt; + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); +} + /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ static inline void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, u8 sent_ect) @@ -428,9 +441,35 @@ static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, } } +static inline u8 tcp_accecn_option_init(const struct sk_buff *skb, + u8 opt_offset) +{ + u8 *ptr = skb_transport_header(skb) + opt_offset; + unsigned int optlen = ptr[1] - 2; + + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return TCP_ACCECN_OPT_FAIL_SEEN; + ptr += 2; + + /* Detect option zeroing: an AccECN connection "MAY check that the + * initial value of the EE0B field or the EE1B field is non-zero" + */ + if (optlen < TCPOLEN_ACCECN_PERFIELD) + return TCP_ACCECN_OPT_EMPTY_SEEN; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + if (optlen < TCPOLEN_ACCECN_PERFIELD * 3) + return TCP_ACCECN_OPT_COUNTER_SEEN; + ptr += TCPOLEN_ACCECN_PERFIELD * 2; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + + return TCP_ACCECN_OPT_COUNTER_SEEN; +} + /* See Table 2 of the AccECN draft */ -static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, - u8 ip_dsfield) +static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, + const struct tcphdr *th, u8 ip_dsfield) { struct tcp_sock *tp = tcp_sk(sk); u8 ace = tcp_accecn_ace(th); @@ -469,7 +508,13 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; - tp->accecn_opt_demand = 2; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tp->accecn_opt_demand = 2; + } if (INET_ECN_is_ce(ip_dsfield) && tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt)) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 53e0e85b52be..dce3113787a7 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -323,6 +323,8 @@ struct tcp_info { __u32 tcpi_received_e1_bytes; __u32 tcpi_received_e0_bytes; __u32 tcpi_received_ce_bytes; + __u16 tcpi_accecn_fail_mode; + __u16 tcpi_accecn_opt_seen; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 090f9ac43d4c..5b5c655ded1d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3409,6 +3409,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered = 0; tp->delivered_ce = 0; tp->accecn_fail_mode = 0; + tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; @@ -4287,6 +4288,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; + info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; info->tcpi_received_ce = tp->received_ce; info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 87154fd86167..5732f2d4329c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -398,7 +398,22 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, unsigned int i; u8 *ptr; + if (tcp_accecn_opt_fail_recv(tp)) + return false; + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (!tp->saw_accecn_opt) { + /* Too late to enable after this point due to + * potential counter wraps + */ + if (tp->bytes_sent >= (1 << 23) - 1) { + u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN; + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + } + return false; + } + if (estimate_ecnfield) { u8 ecnfield = estimate_ecnfield - 1; @@ -415,6 +430,13 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, order1 = (ptr[0] == TCPOPT_ACCECN1); ptr += 2; + if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + tp->saw_accecn_opt = tcp_accecn_option_init(skb, + tp->rx_opt.accecn); + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); + } + res = !!estimate_ecnfield; for (i = 0; i < 3; i++) { u32 init_offset; @@ -6123,7 +6145,13 @@ step1: if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; - tcp_accecn_opt_demand_min(sk, 1); + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tcp_accecn_opt_demand_min(sk, 1); + } } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && @@ -6606,7 +6634,8 @@ consume: */ if (tcp_ecn_mode_any(tp)) - tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield); + tcp_ecn_rcv_synack(sk, skb, th, + TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -7177,6 +7206,8 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; tcp_rsk(req)->accecn_ok = 0; + tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; + tcp_rsk(req)->accecn_fail_mode = 0; tcp_rsk(req)->syn_ect_rcv = 0; tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 193343494558..327095ef95ef 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,6 +463,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->saw_accecn_opt = treq->saw_accecn_opt; tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); @@ -678,6 +679,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, bool own_req; tmp_opt.saw_tstamp = 0; + tmp_opt.accecn = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); @@ -855,6 +857,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; + if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && + tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); + + tcp_rsk(req)->saw_accecn_opt = saw_opt; + if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { + u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; + + tcp_rsk(req)->accecn_fail_mode |= fail_mode; + } + } + /* For Fast Open no more processing is needed (sk is the * child socket). */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f897c2594954..65b90f73daa0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -985,9 +985,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } - /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ + /* Simultaneous open SYN/ACK needs AccECN option but not SYN. + * It is attempted to negotiate the use of AccECN also on the first + * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs" + * of AccECN draft. + */ if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && tcp_ecn_mode_accecn(tp) && + inet_csk(sk)->icsk_retransmits < 2 && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && remaining >= TCPOLEN_ACCECN_BASE)) { opts->use_synack_ecn_bytes = 1; @@ -1076,7 +1081,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, if (treq->accecn_ok && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && - remaining >= TCPOLEN_ACCECN_BASE) { + req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { opts->use_synack_ecn_bytes = 1; remaining -= tcp_options_fit_accecn(opts, 0, remaining); } @@ -1156,7 +1161,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (tcp_ecn_mode_accecn(tp)) { int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); - if (ecn_opt && + if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) && (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk))) { opts->use_synack_ecn_bytes = 0; -- cgit v1.2.3 From fe2cddc648f0d7cdf7377e1cb5a8c7dc5547e290 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:33 +0200 Subject: tcp: accecn: AccECN option ceb/cep and ACE field multi-wrap heuristics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AccECN option ceb/cep heuristic algorithm is from AccECN spec Appendix A.2.2 to mitigate against false ACE field overflows. Armed with ceb delta from option, delivered bytes, and delivered packets it is possible to estimate how many times ACE field wrapped. This calculation is necessary only if more than one wrap is possible. Without SACK, delivered bytes and packets are not always trustworthy in which case TCP falls back to the simpler no-or-all wraps ceb algorithm. Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-10-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 78dd7b8a4145..7c51a0a5ace8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -256,6 +256,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ TCPOLEN_ACCECN_PERFIELD * \ TCP_ACCECN_NUMFIELDS) +#define TCP_ACCECN_SAFETY_SHIFT 1 /* SAFETY_FACTOR in accecn draft */ /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5732f2d4329c..9fdc6ce25eb1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -493,16 +493,19 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, u32 delivered_pkts, u32 delivered_bytes, int flag) { + u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1]; const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); - u32 delta, safe_delta; + u32 delta, safe_delta, d_ceb; + bool opt_deltas_valid; u32 corrected_ace; /* Reordered ACK or uncertain due to lack of data to send and ts */ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) return 0; - tcp_accecn_process_option(tp, skb, delivered_bytes, flag); + opt_deltas_valid = tcp_accecn_process_option(tp, skb, + delivered_bytes, flag); if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ @@ -525,6 +528,35 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, safe_delta = delivered_pkts - ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); + if (opt_deltas_valid) { + d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb; + if (!d_ceb) + return delta; + + if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) && + (tcp_is_sack(tp) || + ((1 << inet_csk(sk)->icsk_ca_state) & + (TCPF_CA_Open | TCPF_CA_CWR)))) { + u32 est_d_cep; + + if (delivered_bytes <= d_ceb) + return safe_delta; + + est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb * + delivered_pkts, + delivered_bytes); + return min(safe_delta, + delta + + (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK)); + } + + if (d_ceb > delta * tp->mss_cache) + return safe_delta; + if (d_ceb < + safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT) + return delta; + } + return safe_delta; } -- cgit v1.2.3 From 3fbb2a6f3a70c27a6a2be80d131970608c0f84d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:42 +0000 Subject: ipv6: make ipv6_pinfo.saddr_cache a boolean ipv6_pinfo.saddr_cache is either NULL or &np->saddr. We do not need 8 bytes, a boolean is enough. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-2-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 4 ++-- include/net/ip6_route.h | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 3 ++- net/ipv6/route.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 7 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index f43314517396..55c4d1e4dd7d 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -216,10 +216,10 @@ struct inet6_cork { struct ipv6_pinfo { struct in6_addr saddr; struct in6_pktinfo sticky_pktinfo; - const struct in6_addr *daddr_cache; #ifdef CONFIG_IPV6_SUBTREES - const struct in6_addr *saddr_cache; + bool saddr_cache; #endif + const struct in6_addr *daddr_cache; __be32 flow_label; __u32 frag_size; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 59f48ca3abdf..223c02d42688 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -230,7 +230,7 @@ static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, const struct in6_addr *daddr, - const struct in6_addr *saddr) + bool saddr_set) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -238,7 +238,7 @@ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, sk_setup_caps(sk, dst); np->daddr_cache = daddr; #ifdef CONFIG_IPV6_SUBTREES - np->saddr_cache = saddr; + np->saddr_cache = saddr_set; #endif } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 1992621e3f3f..c342f8daea7f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,7 +857,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); } return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 333e43434dd7..1947ccdb00df 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -91,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); } return dst; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9d64c13bab5e..82ff6e1293d0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1102,7 +1102,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, */ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || #ifdef CONFIG_IPV6_SUBTREES - ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, + np->saddr_cache ? &np->saddr : NULL) || #endif (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { dst_release(dst); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3371f16b7a3e..e1b0aebf8bf9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3036,9 +3036,9 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, &sk->sk_v6_daddr : NULL, #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? - &np->saddr : + true : #endif - NULL); + false); } static bool ip6_redirect_nh_match(const struct fib6_result *res, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5f0a138f4220..ecc3a87cd8c4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -299,7 +299,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); icsk->icsk_ext_hdr_len = 0; if (opt) @@ -1459,7 +1459,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, false); newnp->saddr = ireq->ir_v6_loc_addr; -- cgit v1.2.3 From 5489f333ef993bfceebce9ae98944f04eaafcc30 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:43 +0000 Subject: ipv6: make ipv6_pinfo.daddr_cache a boolean ipv6_pinfo.daddr_cache is either NULL or &sk->sk_v6_daddr We do not need 8 bytes, a boolean is enough. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-3-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 2 +- include/net/ip6_route.h | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 3 ++- net/ipv6/route.c | 3 +-- net/ipv6/tcp_ipv6.c | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 55c4d1e4dd7d..8e6d9f8b3dc8 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -219,7 +219,7 @@ struct ipv6_pinfo { #ifdef CONFIG_IPV6_SUBTREES bool saddr_cache; #endif - const struct in6_addr *daddr_cache; + bool daddr_cache; __be32 flow_label; __u32 frag_size; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 223c02d42688..7c5512baa4b2 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -229,14 +229,14 @@ static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) * Store a destination cache entry in a socket */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, + bool daddr_set, bool saddr_set) { struct ipv6_pinfo *np = inet6_sk(sk); np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); sk_setup_caps(sk, dst); - np->daddr_cache = daddr; + np->daddr_cache = daddr_set; #ifdef CONFIG_IPV6_SUBTREES np->saddr_cache = saddr_set; #endif diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c342f8daea7f..1b0314644e0c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,7 +857,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); } return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 1947ccdb00df..ea5cf3fdfdd6 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -91,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); } return dst; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 82ff6e1293d0..f904739e99b9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1100,7 +1100,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, * sockets. * 2. oif also should be the same. */ - if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, + np->daddr_cache ? &sk->sk_v6_daddr : NULL) || #ifdef CONFIG_IPV6_SUBTREES ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache ? &np->saddr : NULL) || diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e1b0aebf8bf9..aee6a10b112a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3032,8 +3032,7 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, #endif ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? - &sk->sk_v6_daddr : NULL, + ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr), #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? true : diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ecc3a87cd8c4..c7271f6359db 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -299,7 +299,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); icsk->icsk_ext_hdr_len = 0; if (opt) @@ -1459,7 +1459,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ip6_dst_store(newsk, dst, NULL, false); + ip6_dst_store(newsk, dst, false, false); newnp->saddr = ireq->ir_v6_loc_addr; -- cgit v1.2.3 From 4effb335b5dab08cb6e2c38d038910f8b527cfc9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:48 +0000 Subject: net: group sk_backlog and sk_receive_queue UDP receivers suffer from sk_rmem_alloc updates, currently sharing a cache line with fields that need to be read-mostly (sock_read_rx group): 1) RFS enabled hosts read sk_napi_id from __udpv6_queue_rcv_skb(). 2) sk->sk_rcvbuf is read from __udp_enqueue_schedule_skb() /* --- cacheline 3 boundary (192 bytes) --- */ struct { atomic_t rmem_alloc; /* 0xc0 0x4 */ // Oops int len; /* 0xc4 0x4 */ struct sk_buff * head; /* 0xc8 0x8 */ struct sk_buff * tail; /* 0xd0 0x8 */ } sk_backlog; /* 0xc0 0x18 */ __u8 __cacheline_group_end__sock_write_rx[0]; /* 0xd8 0 */ __u8 __cacheline_group_begin__sock_read_rx[0]; /* 0xd8 0 */ struct dst_entry * sk_rx_dst; /* 0xd8 0x8 */ int sk_rx_dst_ifindex;/* 0xe0 0x4 */ u32 sk_rx_dst_cookie; /* 0xe4 0x4 */ unsigned int sk_ll_usec; /* 0xe8 0x4 */ unsigned int sk_napi_id; /* 0xec 0x4 */ u16 sk_busy_poll_budget;/* 0xf0 0x2 */ u8 sk_prefer_busy_poll;/* 0xf2 0x1 */ u8 sk_userlocks; /* 0xf3 0x1 */ int sk_rcvbuf; /* 0xf4 0x4 */ struct sk_filter * sk_filter; /* 0xf8 0x8 */ Move sk_error (which is less often dirtied) there. Alternative would be to cache align sock_read_rx but this has more implications/risks. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-8-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 0fd465935334..867dc44140d4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -394,7 +394,6 @@ struct sock { atomic_t sk_drops; __s32 sk_peek_off; - struct sk_buff_head sk_error_queue; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with @@ -412,6 +411,7 @@ struct sock { } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc + struct sk_buff_head sk_error_queue; __cacheline_group_end(sock_write_rx); __cacheline_group_begin(sock_read_rx); -- cgit v1.2.3 From 9db27c80622bd612549ea213390500f7377ee3e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:49 +0000 Subject: udp: add udp_drops_inc() helper Generic sk_drops_inc() reads sk->sk_drop_counters. We know the precise location for UDP sockets. Move sk_drop_counters out of sock_read_rxtx so that sock_write_rxtx starts at a cache line boundary. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-9-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/net/sock.h | 2 +- include/net/udp.h | 5 +++++ net/core/sock.c | 1 - net/ipv4/udp.c | 12 ++++++------ net/ipv6/udp.c | 6 +++--- 5 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 867dc44140d4..82bcdb7d7e67 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -451,7 +451,6 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif - struct numa_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -568,6 +567,7 @@ struct sock { #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif + struct numa_drop_counters *sk_drop_counters; struct rcu_head sk_rcu; netns_tracker ns_tracker; struct xarray sk_user_frags; diff --git a/include/net/udp.h b/include/net/udp.h index 93b159f30e88..a08822e294b0 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -295,6 +295,11 @@ static inline void udp_lib_init_sock(struct sock *sk) set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } +static inline void udp_drops_inc(struct sock *sk) +{ + numa_drop_add(&udp_sk(sk)->drop_counters, 1); +} + /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline int udp_lib_hash(struct sock *sk) { diff --git a/net/core/sock.c b/net/core/sock.c index 1f8ef4d8bcd9..21742da19e45 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4444,7 +4444,6 @@ static int __init sock_struct_check(void) #ifdef CONFIG_MEMCG CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); #endif - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_drop_counters); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 658ae8782799..25143f932447 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1790,7 +1790,7 @@ uncharge_drop: atomic_sub(skb->truesize, &sk->sk_rmem_alloc); drop: - sk_drops_inc(sk); + udp_drops_inc(sk); busylock_release(busy); return err; } @@ -1855,7 +1855,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, IS_UDPLITE(sk)); __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); - sk_drops_inc(sk); + udp_drops_inc(sk); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); @@ -2011,7 +2011,7 @@ try_again: __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); - sk_drops_inc(sk); + udp_drops_inc(sk); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } @@ -2081,7 +2081,7 @@ try_again: if (unlikely(err)) { if (!peeking) { - sk_drops_inc(sk); + udp_drops_inc(sk); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } @@ -2452,7 +2452,7 @@ csum_error: __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - sk_drops_inc(sk); + udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2537,7 +2537,7 @@ start_lookup: nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - sk_drops_inc(sk); + udp_drops_inc(sk); __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(net, UDP_MIB_INERRORS, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e87d0ef861f8..9f4d340d1e3a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -524,7 +524,7 @@ try_again: } if (unlikely(err)) { if (!peeking) { - sk_drops_inc(sk); + udp_drops_inc(sk); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -908,7 +908,7 @@ csum_error: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - sk_drops_inc(sk); + udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -1013,7 +1013,7 @@ start_lookup: } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - sk_drops_inc(sk); + udp_drops_inc(sk); __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP6_INC_STATS(net, UDP_MIB_INERRORS, -- cgit v1.2.3 From 3cd04c8f4afed71a48edef0db5255afc249c2feb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:50 +0000 Subject: udp: make busylock per socket While having all spinlocks packed into an array was a space saver, this also caused NUMA imbalance and hash collisions. UDPv6 socket size becomes 1600 after this patch. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-10-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/udp.h | 1 + include/net/udp.h | 1 + net/ipv4/udp.c | 20 ++------------------ 3 files changed, 4 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/include/linux/udp.h b/include/linux/udp.h index 6ed008ab1665..e554890c4415 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -109,6 +109,7 @@ struct udp_sock { */ struct hlist_node tunnel_list; struct numa_drop_counters drop_counters; + spinlock_t busylock ____cacheline_aligned_in_smp; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/udp.h b/include/net/udp.h index a08822e294b0..eecd64097f91 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -289,6 +289,7 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); sk->sk_drop_counters = &up->drop_counters; + spin_lock_init(&up->busylock); skb_queue_head_init(&up->reader_queue); INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 25143f932447..7d1444821ee5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1689,17 +1689,11 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) * to relieve pressure on the receive_queue spinlock shared by consumer. * Under flood, this means that only one producer can be in line * trying to acquire the receive_queue spinlock. - * These busylock can be allocated on a per cpu manner, instead of a - * per socket one (that would consume a cache line per socket) */ -static int udp_busylocks_log __read_mostly; -static spinlock_t *udp_busylocks __read_mostly; - -static spinlock_t *busylock_acquire(void *ptr) +static spinlock_t *busylock_acquire(struct sock *sk) { - spinlock_t *busy; + spinlock_t *busy = &udp_sk(sk)->busylock; - busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log); spin_lock(busy); return busy; } @@ -3997,7 +3991,6 @@ static void __init bpf_iter_register(void) void __init udp_init(void) { unsigned long limit; - unsigned int i; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; @@ -4006,15 +3999,6 @@ void __init udp_init(void) sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; - /* 16 spinlocks per cpu */ - udp_busylocks_log = ilog2(nr_cpu_ids) + 4; - udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, - GFP_KERNEL); - if (!udp_busylocks) - panic("UDP: failed to alloc udp_busylocks\n"); - for (i = 0; i < (1U << udp_busylocks_log); i++) - spin_lock_init(udp_busylocks + i); - if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); -- cgit v1.2.3 From 00c94ca2b99e6610e483f92e531b319eeaed94aa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:29 -0700 Subject: psp: base PSP device support Add a netlink family for PSP and allow drivers to register support. The "PSP device" is its own object. This allows us to perform more flexible reference counting / lifetime control than if PSP information was part of net_device. In the future we should also be able to "delegate" PSP access to software devices, such as *vlan, veth or netkit more easily. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-3-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 96 +++++++++++++++ include/linux/netdevice.h | 4 + include/net/psp.h | 12 ++ include/net/psp/functions.h | 14 +++ include/net/psp/types.h | 100 ++++++++++++++++ include/uapi/linux/psp.h | 42 +++++++ net/Kconfig | 1 + net/Makefile | 1 + net/psp/Kconfig | 13 ++ net/psp/Makefile | 5 + net/psp/psp-nl-gen.c | 65 ++++++++++ net/psp/psp-nl-gen.h | 30 +++++ net/psp/psp.h | 31 +++++ net/psp/psp_main.c | 139 ++++++++++++++++++++++ net/psp/psp_nl.c | 223 +++++++++++++++++++++++++++++++++++ tools/net/ynl/Makefile.deps | 1 + 16 files changed, 777 insertions(+) create mode 100644 Documentation/netlink/specs/psp.yaml create mode 100644 include/net/psp.h create mode 100644 include/net/psp/functions.h create mode 100644 include/net/psp/types.h create mode 100644 include/uapi/linux/psp.h create mode 100644 net/psp/Kconfig create mode 100644 net/psp/Makefile create mode 100644 net/psp/psp-nl-gen.c create mode 100644 net/psp/psp-nl-gen.h create mode 100644 net/psp/psp.h create mode 100644 net/psp/psp_main.c create mode 100644 net/psp/psp_nl.c (limited to 'include/net') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml new file mode 100644 index 000000000000..706f4baf8764 --- /dev/null +++ b/Documentation/netlink/specs/psp.yaml @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +--- +name: psp + +doc: + PSP Security Protocol Generic Netlink family. + +definitions: + - + type: enum + name: version + entries: [hdr0-aes-gcm-128, hdr0-aes-gcm-256, + hdr0-aes-gmac-128, hdr0-aes-gmac-256] + +attribute-sets: + - + name: dev + attributes: + - + name: id + doc: PSP device ID. + type: u32 + checks: + min: 1 + - + name: ifindex + doc: ifindex of the main netdevice linked to the PSP device. + type: u32 + - + name: psp-versions-cap + doc: Bitmask of PSP versions supported by the device. + type: u32 + enum: version + enum-as-flags: true + - + name: psp-versions-ena + doc: Bitmask of currently enabled (accepted on Rx) PSP versions. + type: u32 + enum: version + enum-as-flags: true + +operations: + list: + - + name: dev-get + doc: Get / dump information about PSP capable devices on the system. + attribute-set: dev + do: + request: + attributes: + - id + reply: &dev-all + attributes: + - id + - ifindex + - psp-versions-cap + - psp-versions-ena + pre: psp-device-get-locked + post: psp-device-unlock + dump: + reply: *dev-all + - + name: dev-add-ntf + doc: Notification about device appearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-del-ntf + doc: Notification about device disappearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-set + doc: Set the configuration of a PSP device. + attribute-set: dev + do: + request: + attributes: + - id + - psp-versions-ena + reply: + attributes: [] + pre: psp-device-get-locked + post: psp-device-unlock + - + name: dev-change-ntf + doc: Notification about device configuration being changed. + notify: dev-get + mcgrp: mgmt + +mcast-groups: + list: + - + name: mgmt + +... diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f5a840c07cf1..1c54d44805fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1906,6 +1906,7 @@ enum netdev_reg_state { * device struct * @mpls_ptr: mpls_dev struct pointer * @mctp_ptr: MCTP specific data + * @psp_dev: PSP crypto device registered for this netdev * * @dev_addr: Hw address (before bcast, * because most packets are unicast) @@ -2310,6 +2311,9 @@ struct net_device { #if IS_ENABLED(CONFIG_MCTP) struct mctp_dev __rcu *mctp_ptr; #endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_dev __rcu *psp_dev; +#endif /* * Cache lines mostly used on receive path (including eth_type_trans()) diff --git a/include/net/psp.h b/include/net/psp.h new file mode 100644 index 000000000000..33bb4d1dc46e --- /dev/null +++ b/include/net/psp.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_ALL_H +#define __NET_PSP_ALL_H + +#include +#include +#include + +/* Do not add any code here. Put it in the sub-headers instead. */ + +#endif /* __NET_PSP_ALL_H */ diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h new file mode 100644 index 000000000000..074f9df9afc3 --- /dev/null +++ b/include/net/psp/functions.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_HELPERS_H +#define __NET_PSP_HELPERS_H + +#include + +/* Driver-facing API */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, + struct psp_dev_caps *psd_caps, void *priv_ptr); +void psp_dev_unregister(struct psp_dev *psd); + +#endif /* __NET_PSP_HELPERS_H */ diff --git a/include/net/psp/types.h b/include/net/psp/types.h new file mode 100644 index 000000000000..d242b1ecee7d --- /dev/null +++ b/include/net/psp/types.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_H +#define __NET_PSP_H + +#include +#include + +struct netlink_ext_ack; + +#define PSP_DEFAULT_UDP_PORT 1000 + +struct psphdr { + u8 nexthdr; + u8 hdrlen; + u8 crypt_offset; + u8 verfl; + __be32 spi; + __be64 iv; + __be64 vc[]; /* optional */ +}; + +#define PSP_SPI_KEY_ID GENMASK(30, 0) +#define PSP_SPI_KEY_PHASE BIT(31) + +#define PSPHDR_CRYPT_OFFSET GENMASK(5, 0) + +#define PSPHDR_VERFL_SAMPLE BIT(7) +#define PSPHDR_VERFL_DROP BIT(6) +#define PSPHDR_VERFL_VERSION GENMASK(5, 2) +#define PSPHDR_VERFL_VIRT BIT(1) +#define PSPHDR_VERFL_ONE BIT(0) + +#define PSP_HDRLEN_NOOPT ((sizeof(struct psphdr) - 8) / 8) + +/** + * struct psp_dev_config - PSP device configuration + * @versions: PSP versions enabled on the device + */ +struct psp_dev_config { + u32 versions; +}; + +/** + * struct psp_dev - PSP device struct + * @main_netdev: original netdevice of this PSP device + * @ops: driver callbacks + * @caps: device capabilities + * @drv_priv: driver priv pointer + * @lock: instance lock, protects all fields + * @refcnt: reference count for the instance + * @id: instance id + * @config: current device configuration + * + * @rcu: RCU head for freeing the structure + */ +struct psp_dev { + struct net_device *main_netdev; + + struct psp_dev_ops *ops; + struct psp_dev_caps *caps; + void *drv_priv; + + struct mutex lock; + refcount_t refcnt; + + u32 id; + + struct psp_dev_config config; + + struct rcu_head rcu; +}; + +/** + * struct psp_dev_caps - PSP device capabilities + */ +struct psp_dev_caps { + /** + * @versions: mask of supported PSP versions + * Set this field to 0 to indicate PSP is not supported at all. + */ + u32 versions; +}; + +#define PSP_MAX_KEY 32 + +/** + * struct psp_dev_ops - netdev driver facing PSP callbacks + */ +struct psp_dev_ops { + /** + * @set_config: set configuration of a PSP device + * Driver can inspect @psd->config for the previous configuration. + * Core will update @psd->config with @config on success. + */ + int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, + struct netlink_ext_ack *extack); +}; + +#endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h new file mode 100644 index 000000000000..4a404f085190 --- /dev/null +++ b/include/uapi/linux/psp.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_PSP_H +#define _UAPI_LINUX_PSP_H + +#define PSP_FAMILY_NAME "psp" +#define PSP_FAMILY_VERSION 1 + +enum psp_version { + PSP_VERSION_HDR0_AES_GCM_128, + PSP_VERSION_HDR0_AES_GCM_256, + PSP_VERSION_HDR0_AES_GMAC_128, + PSP_VERSION_HDR0_AES_GMAC_256, +}; + +enum { + PSP_A_DEV_ID = 1, + PSP_A_DEV_IFINDEX, + PSP_A_DEV_PSP_VERSIONS_CAP, + PSP_A_DEV_PSP_VERSIONS_ENA, + + __PSP_A_DEV_MAX, + PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1) +}; + +enum { + PSP_CMD_DEV_GET = 1, + PSP_CMD_DEV_ADD_NTF, + PSP_CMD_DEV_DEL_NTF, + PSP_CMD_DEV_SET, + PSP_CMD_DEV_CHANGE_NTF, + + __PSP_CMD_MAX, + PSP_CMD_MAX = (__PSP_CMD_MAX - 1) +}; + +#define PSP_MCGRP_MGMT "mgmt" + +#endif /* _UAPI_LINUX_PSP_H */ diff --git a/net/Kconfig b/net/Kconfig index d5865cf19799..4b563aea4c23 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -82,6 +82,7 @@ config NET_CRC32C menu "Networking options" source "net/packet/Kconfig" +source "net/psp/Kconfig" source "net/unix/Kconfig" source "net/tls/Kconfig" source "net/xfrm/Kconfig" diff --git a/net/Makefile b/net/Makefile index aac960c41db6..90e3d72bf58b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ +obj-$(CONFIG_INET_PSP) += psp/ obj-y += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/psp/Kconfig b/net/psp/Kconfig new file mode 100644 index 000000000000..55f9dd87446b --- /dev/null +++ b/net/psp/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# PSP configuration +# +config INET_PSP + bool "PSP Security Protocol support" + depends on INET + help + Enable kernel support for the PSP protocol. + For more information see: + https://raw.githubusercontent.com/google/psp/main/doc/PSP_Arch_Spec.pdf + + If unsure, say N. diff --git a/net/psp/Makefile b/net/psp/Makefile new file mode 100644 index 000000000000..41b51d06e560 --- /dev/null +++ b/net/psp/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_INET_PSP) += psp.o + +psp-y := psp_main.o psp_nl.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c new file mode 100644 index 000000000000..859712e7c2c1 --- /dev/null +++ b/net/psp/psp-nl-gen.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "psp-nl-gen.h" + +#include + +/* PSP_CMD_DEV_GET - do */ +static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* PSP_CMD_DEV_SET - do */ +static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), +}; + +/* Ops table for psp */ +static const struct genl_split_ops psp_nl_ops[] = { + { + .cmd = PSP_CMD_DEV_GET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_get_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_get_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_DEV_GET, + .dumpit = psp_nl_dev_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = PSP_CMD_DEV_SET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_set_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_set_nl_policy, + .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group psp_nl_mcgrps[] = { + [PSP_NLGRP_MGMT] = { "mgmt", }, +}; + +struct genl_family psp_nl_family __ro_after_init = { + .name = PSP_FAMILY_NAME, + .version = PSP_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = psp_nl_ops, + .n_split_ops = ARRAY_SIZE(psp_nl_ops), + .mcgrps = psp_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(psp_nl_mcgrps), +}; diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h new file mode 100644 index 000000000000..a099686cab5d --- /dev/null +++ b/net/psp/psp-nl-gen.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_PSP_GEN_H +#define _LINUX_PSP_GEN_H + +#include +#include + +#include + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + PSP_NLGRP_MGMT, +}; + +extern struct genl_family psp_nl_family; + +#endif /* _LINUX_PSP_GEN_H */ diff --git a/net/psp/psp.h b/net/psp/psp.h new file mode 100644 index 000000000000..94d0cc31a61f --- /dev/null +++ b/net/psp/psp.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __PSP_PSP_H +#define __PSP_PSP_H + +#include +#include +#include +#include +#include + +extern struct xarray psp_devs; +extern struct mutex psp_devs_lock; + +void psp_dev_destroy(struct psp_dev *psd); +int psp_dev_check_access(struct psp_dev *psd, struct net *net); + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); + +static inline void psp_dev_get(struct psp_dev *psd) +{ + refcount_inc(&psd->refcnt); +} + +static inline void psp_dev_put(struct psp_dev *psd) +{ + if (refcount_dec_and_test(&psd->refcnt)) + psp_dev_destroy(psd); +} + +#endif /* __PSP_PSP_H */ diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c new file mode 100644 index 000000000000..e09499b7b14a --- /dev/null +++ b/net/psp/psp_main.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +#include "psp.h" +#include "psp-nl-gen.h" + +DEFINE_XARRAY_ALLOC1(psp_devs); +struct mutex psp_devs_lock; + +/** + * DOC: PSP locking + * + * psp_devs_lock protects the psp_devs xarray. + * Ordering is take the psp_devs_lock and then the instance lock. + * Each instance is protected by RCU, and has a refcount. + * When driver unregisters the instance gets flushed, but struct sticks around. + */ + +/** + * psp_dev_check_access() - check if user in a given net ns can access PSP dev + * @psd: PSP device structure user is trying to access + * @net: net namespace user is in + * + * Return: 0 if PSP device should be visible in @net, errno otherwise. + */ +int psp_dev_check_access(struct psp_dev *psd, struct net *net) +{ + if (dev_net(psd->main_netdev) == net) + return 0; + return -ENOENT; +} + +/** + * psp_dev_create() - create and register PSP device + * @netdev: main netdevice + * @psd_ops: driver callbacks + * @psd_caps: device capabilities + * @priv_ptr: back-pointer to driver private data + * + * Return: pointer to allocated PSP device, or ERR_PTR. + */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, + struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, + void *priv_ptr) +{ + struct psp_dev *psd; + static u32 last_id; + int err; + + if (WARN_ON(!psd_caps->versions || + !psd_ops->set_config)) + return ERR_PTR(-EINVAL); + + psd = kzalloc(sizeof(*psd), GFP_KERNEL); + if (!psd) + return ERR_PTR(-ENOMEM); + + psd->main_netdev = netdev; + psd->ops = psd_ops; + psd->caps = psd_caps; + psd->drv_priv = priv_ptr; + + mutex_init(&psd->lock); + refcount_set(&psd->refcnt, 1); + + mutex_lock(&psp_devs_lock); + err = xa_alloc_cyclic(&psp_devs, &psd->id, psd, xa_limit_16b, + &last_id, GFP_KERNEL); + if (err) { + mutex_unlock(&psp_devs_lock); + kfree(psd); + return ERR_PTR(err); + } + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_ADD_NTF); + + rcu_assign_pointer(netdev->psp_dev, psd); + + mutex_unlock(&psd->lock); + + return psd; +} +EXPORT_SYMBOL(psp_dev_create); + +void psp_dev_destroy(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + xa_erase(&psp_devs, psd->id); + mutex_unlock(&psp_devs_lock); + + mutex_destroy(&psd->lock); + kfree_rcu(psd, rcu); +} + +/** + * psp_dev_unregister() - unregister PSP device + * @psd: PSP device structure + */ +void psp_dev_unregister(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + mutex_lock(&psd->lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_DEL_NTF); + + /* Wait until psp_dev_destroy() to call xa_erase() to prevent a + * different psd from being added to the xarray with this id, while + * there are still references to this psd being held. + */ + xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); + mutex_unlock(&psp_devs_lock); + + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); + + psd->ops = NULL; + psd->drv_priv = NULL; + + mutex_unlock(&psd->lock); + + psp_dev_put(psd); +} +EXPORT_SYMBOL(psp_dev_unregister); + +static int __init psp_init(void) +{ + mutex_init(&psp_devs_lock); + + return genl_register_family(&psp_nl_family); +} + +subsys_initcall(psp_init); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c new file mode 100644 index 000000000000..fda5ce800f82 --- /dev/null +++ b/net/psp/psp_nl.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +#include "psp-nl-gen.h" +#include "psp.h" + +/* Netlink helpers */ + +static struct sk_buff *psp_nl_reply_new(struct genl_info *info) +{ + struct sk_buff *rsp; + void *hdr; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return NULL; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + nlmsg_free(rsp); + return NULL; + } + + return rsp; +} + +static int psp_nl_reply_send(struct sk_buff *rsp, struct genl_info *info) +{ + /* Note that this *only* works with a single message per skb! */ + nlmsg_end(rsp, (struct nlmsghdr *)rsp->data); + + return genlmsg_reply(rsp, info); +} + +/* Device stuff */ + +static struct psp_dev * +psp_device_get_and_lock(struct net *net, struct nlattr *dev_id) +{ + struct psp_dev *psd; + int err; + + mutex_lock(&psp_devs_lock); + psd = xa_load(&psp_devs, nla_get_u32(dev_id)); + if (!psd) { + mutex_unlock(&psp_devs_lock); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + err = psp_dev_check_access(psd, net); + if (err) { + mutex_unlock(&psd->lock); + return ERR_PTR(err); + } + + return psd; +} + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_ID)) + return -EINVAL; + + info->user_ptr[0] = psp_device_get_and_lock(genl_info_net(info), + info->attrs[PSP_A_DEV_ID]); + return PTR_ERR_OR_ZERO(info->user_ptr[0]); +} + +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + + mutex_unlock(&psd->lock); +} + +static int +psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(rsp, PSP_A_DEV_IFINDEX, psd->main_netdev->ifindex) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_CAP, psd->caps->versions) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_ENA, psd->config.versions)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + + if (!genl_has_listeners(&psp_nl_family, dev_net(psd->main_netdev), + PSP_NLGRP_MGMT)) + return; + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + genl_info_init_ntf(&info, &psp_nl_family, cmd); + if (psp_nl_dev_fill(psd, ntf, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_MGMT, GFP_KERNEL); +} + +int psp_nl_dev_get_doit(struct sk_buff *req, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct sk_buff *rsp; + int err; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + err = psp_nl_dev_fill(psd, rsp, info); + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +psp_nl_dev_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb, + struct psp_dev *psd) +{ + if (psp_dev_check_access(psd, sock_net(rsp->sk))) + return 0; + + return psp_nl_dev_fill(psd, rsp, genl_info_dump(cb)); +} + +int psp_nl_dev_get_dumpit(struct sk_buff *rsp, struct netlink_callback *cb) +{ + struct psp_dev *psd; + int err = 0; + + mutex_lock(&psp_devs_lock); + xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) { + mutex_lock(&psd->lock); + err = psp_nl_dev_get_dumpit_one(rsp, cb, psd); + mutex_unlock(&psd->lock); + if (err) + break; + } + mutex_unlock(&psp_devs_lock); + + return err; +} + +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct psp_dev_config new_config; + struct sk_buff *rsp; + int err; + + memcpy(&new_config, &psd->config, sizeof(new_config)); + + if (info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]) { + new_config.versions = + nla_get_u32(info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]); + if (new_config.versions & ~psd->caps->versions) { + NL_SET_ERR_MSG(info->extack, "Requested PSP versions not supported by the device"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(info->extack, "No settings present"); + return -EINVAL; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + if (memcmp(&new_config, &psd->config, sizeof(new_config))) { + err = psd->ops->set_config(psd, &new_config, info->extack); + if (err) + goto err_free_rsp; + + memcpy(&psd->config, &new_config, sizeof(new_config)); + } + + psp_nl_notify_dev(psd, PSP_CMD_DEV_CHANGE_NTF); + + return psp_nl_reply_send(rsp, info); + +err_free_rsp: + nlmsg_free(rsp); + return err; +} diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 90686e241157..865fd2e8519e 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -31,6 +31,7 @@ CFLAGS_ovpn:=$(call get_hdr_inc,_LINUX_OVPN_H,ovpn.h) CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_flow:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_vport:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) +CFLAGS_psp:=$(call get_hdr_inc,_LINUX_PSP_H,psp.h) CFLAGS_rt-addr:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ $(call get_hdr_inc,__LINUX_IF_ADDR_H,if_addr.h) CFLAGS_rt-link:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ -- cgit v1.2.3 From ed8a507b748336902525aa79e3573552534e8b3e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:30 -0700 Subject: net: modify core data structures for PSP datapath support Add pointers to psp data structures to core networking structs, and an SKB extension to carry the PSP information from the drivers to the socket layer. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-4-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 3 +++ include/net/inet_timewait_sock.h | 3 +++ include/net/psp/functions.h | 6 ++++++ include/net/psp/types.h | 7 +++++++ include/net/sock.h | 4 ++++ net/core/skbuff.c | 4 ++++ net/ipv4/af_inet.c | 2 ++ net/ipv4/tcp_minisocks.c | 2 ++ 8 files changed, 31 insertions(+) (limited to 'include/net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 62e7addccdf6..78ecfa7d00d0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4901,6 +4901,9 @@ enum skb_ext_id { #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) SKB_EXT_MCTP, +#endif +#if IS_ENABLED(CONFIG_INET_PSP) + SKB_EXT_PSP, #endif SKB_EXT_NUM, /* must be last */ }; diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 67a313575780..c1295246216c 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -81,6 +81,9 @@ struct inet_timewait_sock { struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; struct inet_bind2_bucket *tw_tb2; +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_assoc __rcu *psp_assoc; +#endif }; #define tw_tclass tw_tos diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 074f9df9afc3..d0043bd14299 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -5,10 +5,16 @@ #include +struct inet_timewait_sock; + /* Driver-facing API */ struct psp_dev * psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, void *priv_ptr); void psp_dev_unregister(struct psp_dev *psd); +/* Kernel-facing API */ +static inline void psp_sk_assoc_free(struct sock *sk) { } +static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } + #endif /* __NET_PSP_HELPERS_H */ diff --git a/include/net/psp/types.h b/include/net/psp/types.h index d242b1ecee7d..4922fc8d42fd 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -84,6 +84,13 @@ struct psp_dev_caps { #define PSP_MAX_KEY 32 +struct psp_skb_ext { + __be32 spi; + u16 dev_id; + u8 generation; + u8 version; +}; + /** * struct psp_dev_ops - netdev driver facing PSP callbacks */ diff --git a/include/net/sock.h b/include/net/sock.h index 0fd465935334..d1d3d36e39ae 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -249,6 +249,7 @@ struct sk_filter; * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy + * @psp_assoc: PSP association, if socket is PSP-secured * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags @@ -450,6 +451,9 @@ struct sock { #endif #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; +#endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_assoc __rcu *psp_assoc; #endif struct numa_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 23b776cd9879..d331e607edfb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -5062,6 +5063,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MCTP_FLOWS) [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), #endif +#if IS_ENABLED(CONFIG_INET_PSP) + [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 76e38092cd8a..e298dacb4a06 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include #include @@ -158,6 +159,7 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); + psp_sk_assoc_free(sk); } EXPORT_SYMBOL(inet_sock_destruct); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 327095ef95ef..ddb67015ba28 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -24,6 +24,7 @@ #include #include #include +#include static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -392,6 +393,7 @@ void tcp_twsk_destructor(struct sock *sk) } #endif tcp_ao_destroy_sock(sk, true); + psp_twsk_assoc_free(inet_twsk(sk)); } void tcp_twsk_purge(struct list_head *net_exit_list) -- cgit v1.2.3 From 659a2899a57da59f433182eba571881884d6323e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:31 -0700 Subject: tcp: add datapath logic for PSP with inline key exchange Add validation points and state propagation to support PSP key exchange inline, on TCP connections. The expectation is that application will use some well established mechanism like TLS handshake to establish a secure channel over the connection and if both endpoints are PSP-capable - exchange and install PSP keys. Because the connection can existing in PSP-unsecured and PSP-secured state we need to make sure that there are no race conditions or retransmission leaks. On Tx - mark packets with the skb->decrypted bit when PSP key is at the enqueue time. Drivers should only encrypt packets with this bit set. This prevents retransmissions getting encrypted when original transmission was not. Similarly to TLS, we'll use sk->sk_validate_xmit_skb to make sure PSP skbs can't "escape" via a PSP-unaware device without being encrypted. On Rx - validation is done under socket lock. This moves the validation point later than xfrm, for example. Please see the documentation patch for more details on the flow of securing a connection, but for the purpose of this patch what's important is that we want to enforce the invariant that once connection is secured any skb in the receive queue has been encrypted with PSP. Add GRO and coalescing checks to prevent PSP authenticated data from being combined with cleartext data, or data with non-matching PSP state. On Rx, check skb's with psp_skb_coalesce_diff() at points before psp_sk_rx_policy_check(). After skb's are policy checked and on the socket receive queue, skb_cmp_decrypted() is sufficient for checking for coalescable PSP state. On Tx, tcp_write_collapse_fence() should be called when transitioning a socket into PSP Tx state to prevent data sent as cleartext from being coalesced with PSP encapsulated data. This change only adds the validation points, for ease of review. Subsequent change will add the ability to install keys, and flesh the enforcement logic out Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-5-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 6 ++++ include/net/psp/functions.h | 77 +++++++++++++++++++++++++++++++++++++++++++ net/core/gro.c | 2 ++ net/ipv4/inet_timewait_sock.c | 2 ++ net/ipv4/ip_output.c | 5 ++- net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_ipv4.c | 14 +++++++- net/ipv4/tcp_minisocks.c | 18 ++++++++++ net/ipv4/tcp_output.c | 17 ++++++---- net/ipv6/tcp_ipv6.c | 11 +++++++ net/psp/Kconfig | 1 + 11 files changed, 147 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index d8ff24a33459..58d91ccc56e0 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -127,6 +127,8 @@ FN(CANXL_RX_INVALID_FRAME) \ FN(PFMEMALLOC) \ FN(DUALPI2_STEP_DROP) \ + FN(PSP_INPUT) \ + FN(PSP_OUTPUT) \ FNe(MAX) /** @@ -610,6 +612,10 @@ enum skb_drop_reason { * threshold of DualPI2 qdisc. */ SKB_DROP_REASON_DUALPI2_STEP_DROP, + /** @SKB_DROP_REASON_PSP_INPUT: PSP input checks failed */ + SKB_DROP_REASON_PSP_INPUT, + /** @SKB_DROP_REASON_PSP_OUTPUT: PSP output checks failed */ + SKB_DROP_REASON_PSP_OUTPUT, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index d0043bd14299..1ccc5fc238b8 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -3,6 +3,8 @@ #ifndef __NET_PSP_HELPERS_H #define __NET_PSP_HELPERS_H +#include +#include #include struct inet_timewait_sock; @@ -14,7 +16,82 @@ psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, void psp_dev_unregister(struct psp_dev *psd); /* Kernel-facing API */ +#if IS_ENABLED(CONFIG_INET_PSP) static inline void psp_sk_assoc_free(struct sock *sk) { } +static inline void +psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } +static inline void +psp_reply_set_decrypted(struct sk_buff *skb) { } + +static inline void +psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) +{ +} + +static inline unsigned long +__psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, + unsigned long diffs) +{ + return diffs; +} + +static inline enum skb_drop_reason +psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + +static inline enum skb_drop_reason +psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) +{ + return 0; +} + +static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) +{ + return NULL; +} +#else +static inline void psp_sk_assoc_free(struct sock *sk) { } +static inline void +psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } +static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } +static inline void +psp_reply_set_decrypted(struct sk_buff *skb) { } + +static inline void +psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { } + +static inline unsigned long +__psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, + unsigned long diffs) +{ + return diffs; +} + +static inline enum skb_drop_reason +psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + +static inline enum skb_drop_reason +psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) +{ + return 0; +} + +static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) +{ + return NULL; +} +#endif + +static inline unsigned long +psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two) +{ + return __psp_skb_coalesce_diff(one, two, 0); +} #endif /* __NET_PSP_HELPERS_H */ diff --git a/net/core/gro.c b/net/core/gro.c index b350e5b69549..5ba4504cfd28 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include #include #include #include @@ -376,6 +377,7 @@ static void gro_list_prepare(const struct list_head *head, diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); diffs |= gro_list_prepare_tc_ext(skb, p, diffs); + diffs |= __psp_skb_coalesce_diff(skb, p, diffs); } NAPI_GRO_CB(p)->same_flow = !diffs; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5b5426b8ee92..1f83f333b8ac 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -16,6 +16,7 @@ #include #include #include +#include /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash @@ -219,6 +220,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, refcount_set(&tw->tw_refcnt, 0); __module_get(tw->tw_prot->owner); + psp_twsk_init(tw, sk); } return tw; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 2b96651d719b..5ca97ede979c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -84,6 +84,7 @@ #include #include #include +#include static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, @@ -1665,8 +1666,10 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - if (orig_sk) + if (orig_sk) { skb_set_owner_edemux(nskb, (struct sock *)orig_sk); + psp_reply_set_decrypted(nskb); + } if (transmit_time) nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5b5c655ded1d..d6d0d970e014 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -277,6 +277,7 @@ #include #include #include +#include #include #include @@ -705,6 +706,7 @@ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; __skb_header_release(skb); + psp_enqueue_set_decrypted(sk, skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6a63be1f6461..f27f6f865a48 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -1907,6 +1908,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) enum skb_drop_reason reason; struct sock *rsk; + reason = psp_sk_rx_policy_check(sk, skb); + if (reason) + goto err_discard; + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; @@ -1968,6 +1973,7 @@ csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); +err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } @@ -2069,7 +2075,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || !tcp_skb_can_collapse_rx(tail, skb) || thtail->doff != th->doff || - memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) + memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) || + /* prior to PSP Rx policy check, retain exact PSP metadata */ + psp_skb_coalesce_diff(tail, skb)) goto no_coalesce; __skb_pull(skb, hdrlen); @@ -2437,6 +2445,10 @@ do_time_wait: __this_cpu_write(tcp_tw_isn, isn); goto process; } + + drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); + if (drop_reason) + break; } /* to ACK */ fallthrough; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ddb67015ba28..2ec8c6f1cdcc 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -105,9 +105,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); struct tcp_options_received tmp_opt; + enum skb_drop_reason psp_drop; bool paws_reject = false; int ts_recent_stamp; + /* Instead of dropping immediately, wait to see what value is + * returned. We will accept a non psp-encapsulated syn in the + * case where TCP_TW_SYN is returned. + */ + psp_drop = psp_twsk_rx_policy_check(tw, skb); + tmp_opt.saw_tstamp = 0; ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { @@ -125,6 +132,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ + if (psp_drop) + goto out_put; + /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, @@ -195,6 +205,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ + if (psp_drop) + goto out_put; + if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this @@ -248,6 +261,9 @@ kill: return TCP_TW_SYN; } + if (psp_drop) + goto out_put; + if (paws_reject) { *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); @@ -266,6 +282,8 @@ kill: return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } + +out_put: inet_twsk_put(tw); return TCP_TW_SUCCESS; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 388c45859469..223d7feeb19d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -358,13 +359,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ -static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u16 flags) +static void tcp_init_nondata_skb(struct sk_buff *skb, struct sock *sk, + u32 seq, u16 flags) { skb->ip_summed = CHECKSUM_PARTIAL; TCP_SKB_CB(skb)->tcp_flags = flags; tcp_skb_pcount_set(skb, 1); + psp_enqueue_set_decrypted(sk, skb); TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -1656,6 +1659,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Advance write_seq and place onto the write_queue. */ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); + psp_enqueue_set_decrypted(sk, skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); @@ -3778,7 +3782,7 @@ void tcp_send_fin(struct sock *sk) skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ - tcp_init_nondata_skb(skb, tp->write_seq, + tcp_init_nondata_skb(skb, sk, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } @@ -3806,7 +3810,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority, /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), + tcp_init_nondata_skb(skb, sk, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); tcp_mstamp_refresh(tcp_sk(sk)); /* Send it off. */ @@ -4303,7 +4307,7 @@ int tcp_connect(struct sock *sk) /* SYN eats a sequence byte, write_seq updated by * tcp_connect_queue_skb(). */ - tcp_init_nondata_skb(buff, tp->write_seq, TCPHDR_SYN); + tcp_init_nondata_skb(buff, sk, tp->write_seq, TCPHDR_SYN); tcp_mstamp_refresh(tp); tp->retrans_stamp = tcp_time_stamp_ts(tp); tcp_connect_queue_skb(sk, buff); @@ -4428,7 +4432,8 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags) /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK | flags); + tcp_init_nondata_skb(buff, sk, + tcp_acceptable_seq(sk), TCPHDR_ACK | flags); /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. @@ -4474,7 +4479,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) * end to send an ack. Don't queue or clone SKB, just * send it. */ - tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); + tcp_init_nondata_skb(skb, sk, tp->snd_una - !urgent, TCPHDR_ACK); NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5f0a138f4220..4da8eb9183d7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -973,6 +974,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (sk) { /* unconstify the socket only to attach it to buff with care. */ skb_set_owner_edemux(buff, (struct sock *)sk); + psp_reply_set_decrypted(buff); if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; @@ -1605,6 +1607,10 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); + reason = psp_sk_rx_policy_check(sk, skb); + if (reason) + goto err_discard; + /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. @@ -1684,6 +1690,7 @@ csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); +err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1988,6 +1995,10 @@ do_time_wait: __this_cpu_write(tcp_tw_isn, isn); goto process; } + + drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); + if (drop_reason) + break; } /* to ACK */ fallthrough; diff --git a/net/psp/Kconfig b/net/psp/Kconfig index 55f9dd87446b..5e3908a40945 100644 --- a/net/psp/Kconfig +++ b/net/psp/Kconfig @@ -5,6 +5,7 @@ config INET_PSP bool "PSP Security Protocol support" depends on INET + select SKB_DECRYPTED help Enable kernel support for the PSP protocol. For more information see: -- cgit v1.2.3 From 117f02a49b7719b210d154a0d0e728001bf4af06 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:32 -0700 Subject: psp: add op for rotation of device key Rotating the device key is a key part of the PSP protocol design. Some external daemon needs to do it once a day, or so. Add a netlink op to perform this operation. Add a notification group for informing users that key has been rotated and they should rekey (next rotation will cut them off). Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-6-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 21 +++++++++++++++++++ include/net/psp/types.h | 5 +++++ include/uapi/linux/psp.h | 3 +++ net/psp/psp-nl-gen.c | 15 ++++++++++++++ net/psp/psp-nl-gen.h | 2 ++ net/psp/psp_main.c | 3 ++- net/psp/psp_nl.c | 40 ++++++++++++++++++++++++++++++++++++ 7 files changed, 88 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml index 706f4baf8764..054cc02b65ad 100644 --- a/Documentation/netlink/specs/psp.yaml +++ b/Documentation/netlink/specs/psp.yaml @@ -88,9 +88,30 @@ operations: notify: dev-get mcgrp: mgmt + - + name: key-rotate + doc: Rotate the device key. + attribute-set: dev + do: + request: + attributes: + - id + reply: + attributes: + - id + pre: psp-device-get-locked + post: psp-device-unlock + - + name: key-rotate-ntf + doc: Notification about device key getting rotated. + notify: key-rotate + mcgrp: use + mcast-groups: list: - name: mgmt + - + name: use ... diff --git a/include/net/psp/types.h b/include/net/psp/types.h index 4922fc8d42fd..66327fa80c92 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -102,6 +102,11 @@ struct psp_dev_ops { */ int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, struct netlink_ext_ack *extack); + + /** + * @key_rotate: rotate the device key + */ + int (*key_rotate)(struct psp_dev *psd, struct netlink_ext_ack *extack); }; #endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index 4a404f085190..cbfbf3f0f364 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -32,11 +32,14 @@ enum { PSP_CMD_DEV_DEL_NTF, PSP_CMD_DEV_SET, PSP_CMD_DEV_CHANGE_NTF, + PSP_CMD_KEY_ROTATE, + PSP_CMD_KEY_ROTATE_NTF, __PSP_CMD_MAX, PSP_CMD_MAX = (__PSP_CMD_MAX - 1) }; #define PSP_MCGRP_MGMT "mgmt" +#define PSP_MCGRP_USE "use" #endif /* _UAPI_LINUX_PSP_H */ diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c index 859712e7c2c1..7f49577ac72f 100644 --- a/net/psp/psp-nl-gen.c +++ b/net/psp/psp-nl-gen.c @@ -21,6 +21,11 @@ static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), }; +/* PSP_CMD_KEY_ROTATE - do */ +static const struct nla_policy psp_key_rotate_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + /* Ops table for psp */ static const struct genl_split_ops psp_nl_ops[] = { { @@ -46,10 +51,20 @@ static const struct genl_split_ops psp_nl_ops[] = { .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = PSP_CMD_KEY_ROTATE, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_key_rotate_doit, + .post_doit = psp_device_unlock, + .policy = psp_key_rotate_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group psp_nl_mcgrps[] = { [PSP_NLGRP_MGMT] = { "mgmt", }, + [PSP_NLGRP_USE] = { "use", }, }; struct genl_family psp_nl_family __ro_after_init = { diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h index a099686cab5d..00a2d4ec59e4 100644 --- a/net/psp/psp-nl-gen.h +++ b/net/psp/psp-nl-gen.h @@ -20,9 +20,11 @@ psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info); enum { PSP_NLGRP_MGMT, + PSP_NLGRP_USE, }; extern struct genl_family psp_nl_family; diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index e09499b7b14a..f60155493afc 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -54,7 +54,8 @@ psp_dev_create(struct net_device *netdev, int err; if (WARN_ON(!psd_caps->versions || - !psd_ops->set_config)) + !psd_ops->set_config || + !psd_ops->key_rotate)) return ERR_PTR(-EINVAL); psd = kzalloc(sizeof(*psd), GFP_KERNEL); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index fda5ce800f82..75f2702c1029 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -221,3 +221,43 @@ err_free_rsp: nlmsg_free(rsp); return err; } + +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct genl_info ntf_info; + struct sk_buff *ntf, *rsp; + int err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + genl_info_init_ntf(&ntf_info, &psp_nl_family, PSP_CMD_KEY_ROTATE_NTF); + ntf = psp_nl_reply_new(&ntf_info); + if (!ntf) { + err = -ENOMEM; + goto err_free_rsp; + } + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(ntf, PSP_A_DEV_ID, psd->id)) { + err = -EMSGSIZE; + goto err_free_ntf; + } + + err = psd->ops->key_rotate(psd, info->extack); + if (err) + goto err_free_ntf; + + nlmsg_end(ntf, (struct nlmsghdr *)ntf->data); + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_USE, GFP_KERNEL); + return psp_nl_reply_send(rsp, info); + +err_free_ntf: + nlmsg_free(ntf); +err_free_rsp: + nlmsg_free(rsp); + return err; +} -- cgit v1.2.3 From 8c511c1df380780b8a81050767dbfe7ca518d3a2 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 16 Sep 2025 17:09:33 -0700 Subject: net: move sk_validate_xmit_skb() to net/core/dev.c Move definition of sk_validate_xmit_skb() from net/core/sock.c to net/core/dev.c. This change is in preparation of the next patch, where sk_validate_xmit_skb() will need to cast sk to a tcp_timewait_sock *, and access member fields. Including linux/tcp.h from linux/sock.h creates a circular dependency, and dev.c is the only current call site of this function. Reviewed-by: Willem de Bruijn Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-7-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/sock.h | 22 ---------------------- net/core/dev.c | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index d1d3d36e39ae..bf92029a88d6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2960,28 +2960,6 @@ sk_requests_wifi_status(struct sock *sk) return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS); } -/* Checks if this SKB belongs to an HW offloaded socket - * and whether any SW fallbacks are required based on dev. - * Check decrypted mark in case skb_orphan() cleared socket. - */ -static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, - struct net_device *dev) -{ -#ifdef CONFIG_SOCK_VALIDATE_XMIT - struct sock *sk = skb->sk; - - if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { - skb = sk->sk_validate_xmit_skb(sk, dev, skb); - } else if (unlikely(skb_is_decrypted(skb))) { - pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); - kfree_skb(skb); - skb = NULL; - } -#endif - - return skb; -} - /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV * SYNACK messages can be attached to either ones (depending on SYNCOOKIE) */ diff --git a/net/core/dev.c b/net/core/dev.c index 2522d9d8f0e4..384e59d7e715 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3907,6 +3907,28 @@ sw_checksum: } EXPORT_SYMBOL(skb_csum_hwoffload_help); +/* Checks if this SKB belongs to an HW offloaded socket + * and whether any SW fallbacks are required based on dev. + * Check decrypted mark in case skb_orphan() cleared socket. + */ +static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, + struct net_device *dev) +{ +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sock *sk = skb->sk; + + if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { + skb = sk->sk_validate_xmit_skb(sk, dev, skb); + } else if (unlikely(skb_is_decrypted(skb))) { + pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); + kfree_skb(skb); + skb = NULL; + } +#endif + + return skb; +} + static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, struct net_device *dev) { -- cgit v1.2.3 From 0917bb139eed467a6376db903ad7a67981ec1420 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 16 Sep 2025 17:09:34 -0700 Subject: net: tcp: allow tcp_timewait_sock to validate skbs before handing to device Provide a callback to validate skb's originating from tcp timewait socks before passing to the device layer. Full socks have a sk_validate_xmit_skb member for checking that a device is capable of performing offloads required for transmitting an skb. With psp, tcp timewait socks will inherit the crypto state from their corresponding full socks. Any ACKs or RSTs that originate from a tcp timewait sock carrying psp state should be psp encapsulated. Reviewed-by: Willem de Bruijn Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-8-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/inet_timewait_sock.h | 5 +++++ net/core/dev.c | 14 ++++++++++++-- net/ipv4/inet_timewait_sock.c | 3 +++ 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index c1295246216c..3a31c74c9e15 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -84,6 +84,11 @@ struct inet_timewait_sock { #if IS_ENABLED(CONFIG_INET_PSP) struct psp_assoc __rcu *psp_assoc; #endif +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff* (*tw_validate_xmit_skb)(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb); +#endif }; #define tw_tclass tw_tos diff --git a/net/core/dev.c b/net/core/dev.c index 384e59d7e715..5e22d062bac5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3915,10 +3915,20 @@ static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { #ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev, + struct sk_buff *skb); struct sock *sk = skb->sk; - if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { - skb = sk->sk_validate_xmit_skb(sk, dev, skb); + sk_validate = NULL; + if (sk) { + if (sk_fullsock(sk)) + sk_validate = sk->sk_validate_xmit_skb; + else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT) + sk_validate = inet_twsk(sk)->tw_validate_xmit_skb; + } + + if (sk_validate) { + skb = sk_validate(sk, dev, skb); } else if (unlikely(skb_is_decrypted(skb))) { pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 1f83f333b8ac..2ca2912f61f4 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -212,6 +212,9 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); timer_setup(&tw->tw_timer, tw_timer_handler, 0); +#ifdef CONFIG_SOCK_VALIDATE_XMIT + tw->tw_validate_xmit_skb = NULL; +#endif /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this -- cgit v1.2.3 From 6b46ca260e2290e3453d1355ab5b6d283d73d780 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:35 -0700 Subject: net: psp: add socket security association code Add the ability to install PSP Rx and Tx crypto keys on TCP connections. Netlink ops are provided for both operations. Rx side combines allocating a new Rx key and installing it on the socket. Theoretically these are separate actions, but in practice they will always be used one after the other. We can add distinct "alloc" and "install" ops later. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-9-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 70 +++++++++ include/net/psp/functions.h | 114 +++++++++++++-- include/net/psp/types.h | 57 ++++++++ include/uapi/linux/psp.h | 21 +++ net/psp/Kconfig | 1 + net/psp/Makefile | 2 +- net/psp/psp-nl-gen.c | 39 +++++ net/psp/psp-nl-gen.h | 7 + net/psp/psp.h | 22 +++ net/psp/psp_main.c | 26 +++- net/psp/psp_nl.c | 232 +++++++++++++++++++++++++++++ net/psp/psp_sock.c | 274 +++++++++++++++++++++++++++++++++++ 12 files changed, 854 insertions(+), 11 deletions(-) create mode 100644 net/psp/psp_sock.c (limited to 'include/net') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml index 054cc02b65ad..944429e5c9a8 100644 --- a/Documentation/netlink/specs/psp.yaml +++ b/Documentation/netlink/specs/psp.yaml @@ -38,6 +38,44 @@ attribute-sets: type: u32 enum: version enum-as-flags: true + - + name: assoc + attributes: + - + name: dev-id + doc: PSP device ID. + type: u32 + checks: + min: 1 + - + name: version + doc: | + PSP versions (AEAD and protocol version) used by this association, + dictates the size of the key. + type: u32 + enum: version + - + name: rx-key + type: nest + nested-attributes: keys + - + name: tx-key + type: nest + nested-attributes: keys + - + name: sock-fd + doc: Sockets which should be bound to the association immediately. + type: u32 + - + name: keys + attributes: + - + name: key + type: binary + - + name: spi + doc: Security Parameters Index (SPI) of the association. + type: u32 operations: list: @@ -107,6 +145,38 @@ operations: notify: key-rotate mcgrp: use + - + name: rx-assoc + doc: Allocate a new Rx key + SPI pair, associate it with a socket. + attribute-set: assoc + do: + request: + attributes: + - dev-id + - version + - sock-fd + reply: + attributes: + - dev-id + - rx-key + pre: psp-assoc-device-get-locked + post: psp-device-unlock + - + name: tx-assoc + doc: Add a PSP Tx association. + attribute-set: assoc + do: + request: + attributes: + - dev-id + - version + - tx-key + - sock-fd + reply: + attributes: [] + pre: psp-assoc-device-get-locked + post: psp-device-unlock + mcast-groups: list: - diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 1ccc5fc238b8..0d7141230f47 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -4,7 +4,9 @@ #define __NET_PSP_HELPERS_H #include +#include #include +#include #include struct inet_timewait_sock; @@ -16,41 +18,130 @@ psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, void psp_dev_unregister(struct psp_dev *psd); /* Kernel-facing API */ +void psp_assoc_put(struct psp_assoc *pas); + +static inline void *psp_assoc_drv_data(struct psp_assoc *pas) +{ + return pas->drv_data; +} + #if IS_ENABLED(CONFIG_INET_PSP) -static inline void psp_sk_assoc_free(struct sock *sk) { } -static inline void -psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } -static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } -static inline void -psp_reply_set_decrypted(struct sk_buff *skb) { } +unsigned int psp_key_size(u32 version); +void psp_sk_assoc_free(struct sock *sk); +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk); +void psp_twsk_assoc_free(struct inet_timewait_sock *tw); +void psp_reply_set_decrypted(struct sk_buff *skb); + +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return rcu_dereference_check(sk->psp_assoc, lockdep_sock_is_held(sk)); +} static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { + struct psp_assoc *pas; + + pas = psp_sk_assoc(sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; } static inline unsigned long __psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, unsigned long diffs) { + struct psp_skb_ext *a, *b; + + a = skb_ext_find(one, SKB_EXT_PSP); + b = skb_ext_find(two, SKB_EXT_PSP); + + diffs |= (!!a) ^ (!!b); + if (!diffs && unlikely(a)) + diffs |= memcmp(a, b, sizeof(*a)); return diffs; } +static inline bool +psp_is_allowed_nondata(struct sk_buff *skb, struct psp_assoc *pas) +{ + bool fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN); + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + u32 seq = TCP_SKB_CB(skb)->seq; + bool pure_fin; + + pure_fin = fin && end_seq - seq == 1; + + return seq == end_seq || (pure_fin && seq == pas->upgrade_seq); +} + +static inline bool +psp_pse_matches_pas(struct psp_skb_ext *pse, struct psp_assoc *pas) +{ + return pse && pas->rx.spi == pse->spi && + pas->generation == pse->generation && + pas->version == pse->version && + pas->dev_id == pse->dev_id; +} + +static inline enum skb_drop_reason +__psp_sk_rx_policy_check(struct sk_buff *skb, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse = skb_ext_find(skb, SKB_EXT_PSP); + + if (!pas) + return pse ? SKB_DROP_REASON_PSP_INPUT : 0; + + if (likely(psp_pse_matches_pas(pse, pas))) { + if (unlikely(!pas->peer_tx)) + pas->peer_tx = 1; + + return 0; + } + + if (!pse) { + if (!pas->tx.spi || + (!pas->peer_tx && psp_is_allowed_nondata(skb, pas))) + return 0; + } + + return SKB_DROP_REASON_PSP_INPUT; +} + static inline enum skb_drop_reason psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) { - return 0; + return __psp_sk_rx_policy_check(skb, psp_sk_assoc(sk)); } static inline enum skb_drop_reason psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) { - return 0; + return __psp_sk_rx_policy_check(skb, rcu_dereference(tw->psp_assoc)); +} + +static inline struct psp_assoc *psp_sk_get_assoc_rcu(struct sock *sk) +{ + struct inet_timewait_sock *tw; + struct psp_assoc *pas; + int state; + + state = 1 << READ_ONCE(sk->sk_state); + if (!sk_is_inet(sk) || state & TCPF_NEW_SYN_RECV) + return NULL; + + tw = inet_twsk(sk); + pas = state & TCPF_TIME_WAIT ? rcu_dereference(tw->psp_assoc) : + rcu_dereference(sk->psp_assoc); + return pas; } static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { - return NULL; + if (!skb->decrypted || !skb->sk) + return NULL; + + return psp_sk_get_assoc_rcu(skb->sk); } #else static inline void psp_sk_assoc_free(struct sock *sk) { } @@ -60,6 +151,11 @@ static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } static inline void psp_reply_set_decrypted(struct sk_buff *skb) { } +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return NULL; +} + static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { } diff --git a/include/net/psp/types.h b/include/net/psp/types.h index 66327fa80c92..b0e32e7165a3 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -51,6 +51,7 @@ struct psp_dev_config { * @refcnt: reference count for the instance * @id: instance id * @config: current device configuration + * @active_assocs: list of registered associations * * @rcu: RCU head for freeing the structure */ @@ -68,6 +69,8 @@ struct psp_dev { struct psp_dev_config config; + struct list_head active_assocs; + struct rcu_head rcu; }; @@ -80,6 +83,12 @@ struct psp_dev_caps { * Set this field to 0 to indicate PSP is not supported at all. */ u32 versions; + + /** + * @assoc_drv_spc: size of driver-specific state in Tx assoc + * Determines the size of struct psp_assoc::drv_spc + */ + u32 assoc_drv_spc; }; #define PSP_MAX_KEY 32 @@ -91,6 +100,32 @@ struct psp_skb_ext { u8 version; }; +struct psp_key_parsed { + __be32 spi; + u8 key[PSP_MAX_KEY]; +}; + +struct psp_assoc { + struct psp_dev *psd; + + u16 dev_id; + u8 generation; + u8 version; + u8 peer_tx; + + u32 upgrade_seq; + + struct psp_key_parsed tx; + struct psp_key_parsed rx; + + refcount_t refcnt; + struct rcu_head rcu; + struct work_struct work; + struct list_head assocs_list; + + u8 drv_data[] __aligned(8); +}; + /** * struct psp_dev_ops - netdev driver facing PSP callbacks */ @@ -107,6 +142,28 @@ struct psp_dev_ops { * @key_rotate: rotate the device key */ int (*key_rotate)(struct psp_dev *psd, struct netlink_ext_ack *extack); + + /** + * @rx_spi_alloc: allocate an Rx SPI+key pair + * Allocate an Rx SPI and resulting derived key. + * This key should remain valid until key rotation. + */ + int (*rx_spi_alloc)(struct psp_dev *psd, u32 version, + struct psp_key_parsed *assoc, + struct netlink_ext_ack *extack); + + /** + * @tx_key_add: add a Tx key to the device + * Install an association in the device. Core will allocate space + * for the driver to use at drv_data. + */ + int (*tx_key_add)(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack); + /** + * @tx_key_del: remove a Tx key from the device + * Remove an association from the device. + */ + void (*tx_key_del)(struct psp_dev *psd, struct psp_assoc *pas); }; #endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index cbfbf3f0f364..607c42c39ba5 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -26,6 +26,25 @@ enum { PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1) }; +enum { + PSP_A_ASSOC_DEV_ID = 1, + PSP_A_ASSOC_VERSION, + PSP_A_ASSOC_RX_KEY, + PSP_A_ASSOC_TX_KEY, + PSP_A_ASSOC_SOCK_FD, + + __PSP_A_ASSOC_MAX, + PSP_A_ASSOC_MAX = (__PSP_A_ASSOC_MAX - 1) +}; + +enum { + PSP_A_KEYS_KEY = 1, + PSP_A_KEYS_SPI, + + __PSP_A_KEYS_MAX, + PSP_A_KEYS_MAX = (__PSP_A_KEYS_MAX - 1) +}; + enum { PSP_CMD_DEV_GET = 1, PSP_CMD_DEV_ADD_NTF, @@ -34,6 +53,8 @@ enum { PSP_CMD_DEV_CHANGE_NTF, PSP_CMD_KEY_ROTATE, PSP_CMD_KEY_ROTATE_NTF, + PSP_CMD_RX_ASSOC, + PSP_CMD_TX_ASSOC, __PSP_CMD_MAX, PSP_CMD_MAX = (__PSP_CMD_MAX - 1) diff --git a/net/psp/Kconfig b/net/psp/Kconfig index 5e3908a40945..a7d24691a7e1 100644 --- a/net/psp/Kconfig +++ b/net/psp/Kconfig @@ -6,6 +6,7 @@ config INET_PSP bool "PSP Security Protocol support" depends on INET select SKB_DECRYPTED + select SOCK_VALIDATE_XMIT help Enable kernel support for the PSP protocol. For more information see: diff --git a/net/psp/Makefile b/net/psp/Makefile index 41b51d06e560..eb5ff3c5bfb2 100644 --- a/net/psp/Makefile +++ b/net/psp/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_INET_PSP) += psp.o -psp-y := psp_main.o psp_nl.o psp-nl-gen.o +psp-y := psp_main.o psp_nl.o psp_sock.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c index 7f49577ac72f..9fdd6f831803 100644 --- a/net/psp/psp-nl-gen.c +++ b/net/psp/psp-nl-gen.c @@ -10,6 +10,12 @@ #include +/* Common nested types */ +const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1] = { + [PSP_A_KEYS_KEY] = { .type = NLA_BINARY, }, + [PSP_A_KEYS_SPI] = { .type = NLA_U32, }, +}; + /* PSP_CMD_DEV_GET - do */ static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), @@ -26,6 +32,21 @@ static const struct nla_policy psp_key_rotate_nl_policy[PSP_A_DEV_ID + 1] = { [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), }; +/* PSP_CMD_RX_ASSOC - do */ +static const struct nla_policy psp_rx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + +/* PSP_CMD_TX_ASSOC - do */ +static const struct nla_policy psp_tx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_TX_KEY] = NLA_POLICY_NESTED(psp_keys_nl_policy), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + /* Ops table for psp */ static const struct genl_split_ops psp_nl_ops[] = { { @@ -60,6 +81,24 @@ static const struct genl_split_ops psp_nl_ops[] = { .maxattr = PSP_A_DEV_ID, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = PSP_CMD_RX_ASSOC, + .pre_doit = psp_assoc_device_get_locked, + .doit = psp_nl_rx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_rx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_TX_ASSOC, + .pre_doit = psp_assoc_device_get_locked, + .doit = psp_nl_tx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_tx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group psp_nl_mcgrps[] = { diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h index 00a2d4ec59e4..25268ed11fb5 100644 --- a/net/psp/psp-nl-gen.h +++ b/net/psp/psp-nl-gen.h @@ -11,8 +11,13 @@ #include +/* Common nested types */ +extern const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1]; + int psp_device_get_locked(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); void psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); @@ -21,6 +26,8 @@ int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info); enum { PSP_NLGRP_MGMT, diff --git a/net/psp/psp.h b/net/psp/psp.h index 94d0cc31a61f..defd3e3fd5e7 100644 --- a/net/psp/psp.h +++ b/net/psp/psp.h @@ -4,6 +4,7 @@ #define __PSP_PSP_H #include +#include #include #include #include @@ -17,15 +18,36 @@ int psp_dev_check_access(struct psp_dev *psd, struct net *net); void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); +struct psp_assoc *psp_assoc_create(struct psp_dev *psd); +struct psp_dev *psp_dev_get_for_sock(struct sock *sk); +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas); +int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack); +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack); + static inline void psp_dev_get(struct psp_dev *psd) { refcount_inc(&psd->refcnt); } +static inline bool psp_dev_tryget(struct psp_dev *psd) +{ + return refcount_inc_not_zero(&psd->refcnt); +} + static inline void psp_dev_put(struct psp_dev *psd) { if (refcount_dec_and_test(&psd->refcnt)) psp_dev_destroy(psd); } +static inline bool psp_dev_is_registered(struct psp_dev *psd) +{ + lockdep_assert_held(&psd->lock); + return !!psd->ops; +} + #endif /* __PSP_PSP_H */ diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index f60155493afc..a1ae3c8920c3 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -55,7 +55,10 @@ psp_dev_create(struct net_device *netdev, if (WARN_ON(!psd_caps->versions || !psd_ops->set_config || - !psd_ops->key_rotate)) + !psd_ops->key_rotate || + !psd_ops->rx_spi_alloc || + !psd_ops->tx_key_add || + !psd_ops->tx_key_del)) return ERR_PTR(-EINVAL); psd = kzalloc(sizeof(*psd), GFP_KERNEL); @@ -68,6 +71,7 @@ psp_dev_create(struct net_device *netdev, psd->drv_priv = priv_ptr; mutex_init(&psd->lock); + INIT_LIST_HEAD(&psd->active_assocs); refcount_set(&psd->refcnt, 1); mutex_lock(&psp_devs_lock); @@ -107,6 +111,8 @@ void psp_dev_destroy(struct psp_dev *psd) */ void psp_dev_unregister(struct psp_dev *psd) { + struct psp_assoc *pas, *next; + mutex_lock(&psp_devs_lock); mutex_lock(&psd->lock); @@ -119,6 +125,9 @@ void psp_dev_unregister(struct psp_dev *psd) xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); mutex_unlock(&psp_devs_lock); + list_for_each_entry_safe(pas, next, &psd->active_assocs, assocs_list) + psp_dev_tx_key_del(psd, pas); + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); psd->ops = NULL; @@ -130,6 +139,21 @@ void psp_dev_unregister(struct psp_dev *psd) } EXPORT_SYMBOL(psp_dev_unregister); +unsigned int psp_key_size(u32 version) +{ + switch (version) { + case PSP_VERSION_HDR0_AES_GCM_128: + case PSP_VERSION_HDR0_AES_GMAC_128: + return 16; + case PSP_VERSION_HDR0_AES_GCM_256: + case PSP_VERSION_HDR0_AES_GMAC_256: + return 32; + default: + return 0; + } +} +EXPORT_SYMBOL(psp_key_size); + static int __init psp_init(void) { mutex_init(&psp_devs_lock); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index 75f2702c1029..1b1d08fce637 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -79,9 +79,12 @@ void psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct socket *socket = info->user_ptr[1]; struct psp_dev *psd = info->user_ptr[0]; mutex_unlock(&psd->lock); + if (socket) + sockfd_put(socket); } static int @@ -261,3 +264,232 @@ err_free_rsp: nlmsg_free(rsp); return err; } + +/* Key etc. */ + +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket; + struct psp_dev *psd; + struct nlattr *id; + int fd, err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_SOCK_FD)) + return -EINVAL; + + fd = nla_get_u32(info->attrs[PSP_A_ASSOC_SOCK_FD]); + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + if (!sk_is_tcp(socket->sk)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[PSP_A_ASSOC_SOCK_FD], + "Unsupported socket family and type"); + err = -EOPNOTSUPP; + goto err_sock_put; + } + + psd = psp_dev_get_for_sock(socket->sk); + if (psd) { + err = psp_dev_check_access(psd, genl_info_net(info)); + if (err) { + psp_dev_put(psd); + psd = NULL; + } + } + + if (!psd && GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_DEV_ID)) { + err = -EINVAL; + goto err_sock_put; + } + + id = info->attrs[PSP_A_ASSOC_DEV_ID]; + if (psd) { + mutex_lock(&psd->lock); + if (id && psd->id != nla_get_u32(id)) { + mutex_unlock(&psd->lock); + NL_SET_ERR_MSG_ATTR(info->extack, id, + "Device id vs socket mismatch"); + err = -EINVAL; + goto err_psd_put; + } + + psp_dev_put(psd); + } else { + psd = psp_device_get_and_lock(genl_info_net(info), id); + if (IS_ERR(psd)) { + err = PTR_ERR(psd); + goto err_sock_put; + } + } + + info->user_ptr[0] = psd; + info->user_ptr[1] = socket; + + return 0; + +err_psd_put: + psp_dev_put(psd); +err_sock_put: + sockfd_put(socket); + return err; +} + +static int +psp_nl_parse_key(struct genl_info *info, u32 attr, struct psp_key_parsed *key, + unsigned int key_sz) +{ + struct nlattr *nest = info->attrs[attr]; + struct nlattr *tb[PSP_A_KEYS_SPI + 1]; + u32 spi; + int err; + + err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, + psp_keys_nl_policy, info->extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_KEY) || + NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_SPI)) + return -EINVAL; + + if (nla_len(tb[PSP_A_KEYS_KEY]) != key_sz) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "incorrect key length"); + return -EINVAL; + } + + spi = nla_get_u32(tb[PSP_A_KEYS_SPI]); + if (!(spi & PSP_SPI_KEY_ID)) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "invalid SPI: lower 31b must be non-zero"); + return -EINVAL; + } + + key->spi = cpu_to_be32(spi); + memcpy(key->key, nla_data(tb[PSP_A_KEYS_KEY]), key_sz); + + return 0; +} + +static int +psp_nl_put_key(struct sk_buff *skb, u32 attr, u32 version, + struct psp_key_parsed *key) +{ + int key_sz = psp_key_size(version); + void *nest; + + nest = nla_nest_start(skb, attr); + + if (nla_put_u32(skb, PSP_A_KEYS_SPI, be32_to_cpu(key->spi)) || + nla_put(skb, PSP_A_KEYS_KEY, key_sz, key->key)) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + + nla_nest_end(skb, nest); + + return 0; +} + +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct psp_assoc *pas; + struct sk_buff *rsp; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + pas = psp_assoc_create(psd); + if (!pas) { + err = -ENOMEM; + goto err_free_rsp; + } + pas->version = version; + + err = psd->ops->rx_spi_alloc(psd, version, &key, info->extack); + if (err) + goto err_free_pas; + + if (nla_put_u32(rsp, PSP_A_ASSOC_DEV_ID, psd->id) || + psp_nl_put_key(rsp, PSP_A_ASSOC_RX_KEY, version, &key)) { + err = -EMSGSIZE; + goto err_free_pas; + } + + err = psp_sock_assoc_set_rx(socket->sk, pas, &key, info->extack); + if (err) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_SOCK_FD]); + goto err_free_pas; + } + psp_assoc_put(pas); + + return psp_nl_reply_send(rsp, info); + +err_free_pas: + psp_assoc_put(pas); +err_free_rsp: + nlmsg_free(rsp); + return err; +} + +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct sk_buff *rsp; + unsigned int key_sz; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION) || + GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_TX_KEY)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + key_sz = psp_key_size(version); + if (!key_sz) + return -EINVAL; + + err = psp_nl_parse_key(info, PSP_A_ASSOC_TX_KEY, &key, key_sz); + if (err < 0) + return err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + err = psp_sock_assoc_set_tx(socket->sk, psd, version, &key, + info->extack); + if (err) + goto err_free_msg; + + return psp_nl_reply_send(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c new file mode 100644 index 000000000000..8ebccee94593 --- /dev/null +++ b/net/psp/psp_sock.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +#include +#include +#include "psp.h" + +struct psp_dev *psp_dev_get_for_sock(struct sock *sk) +{ + struct dst_entry *dst; + struct psp_dev *psd; + + dst = sk_dst_get(sk); + if (!dst) + return NULL; + + rcu_read_lock(); + psd = rcu_dereference(dst->dev->psp_dev); + if (psd && !psp_dev_tryget(psd)) + psd = NULL; + rcu_read_unlock(); + + dst_release(dst); + + return psd; +} + +static struct sk_buff * +psp_validate_xmit(struct sock *sk, struct net_device *dev, struct sk_buff *skb) +{ + struct psp_assoc *pas; + bool good; + + rcu_read_lock(); + pas = psp_skb_get_assoc_rcu(skb); + good = !pas || rcu_access_pointer(dev->psp_dev) == pas->psd; + rcu_read_unlock(); + if (!good) { + kfree_skb_reason(skb, SKB_DROP_REASON_PSP_OUTPUT); + return NULL; + } + + return skb; +} + +struct psp_assoc *psp_assoc_create(struct psp_dev *psd) +{ + struct psp_assoc *pas; + + lockdep_assert_held(&psd->lock); + + pas = kzalloc(struct_size(pas, drv_data, psd->caps->assoc_drv_spc), + GFP_KERNEL_ACCOUNT); + if (!pas) + return NULL; + + pas->psd = psd; + pas->dev_id = psd->id; + psp_dev_get(psd); + refcount_set(&pas->refcnt, 1); + + list_add_tail(&pas->assocs_list, &psd->active_assocs); + + return pas; +} + +static struct psp_assoc *psp_assoc_dummy(struct psp_assoc *pas) +{ + struct psp_dev *psd = pas->psd; + size_t sz; + + lockdep_assert_held(&psd->lock); + + sz = struct_size(pas, drv_data, psd->caps->assoc_drv_spc); + return kmemdup(pas, sz, GFP_KERNEL); +} + +static int psp_dev_tx_key_add(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack) +{ + return psd->ops->tx_key_add(psd, pas, extack); +} + +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas) +{ + if (pas->tx.spi) + psd->ops->tx_key_del(psd, pas); + list_del(&pas->assocs_list); +} + +static void psp_assoc_free(struct work_struct *work) +{ + struct psp_assoc *pas = container_of(work, struct psp_assoc, work); + struct psp_dev *psd = pas->psd; + + mutex_lock(&psd->lock); + if (psd->ops) + psp_dev_tx_key_del(psd, pas); + mutex_unlock(&psd->lock); + psp_dev_put(psd); + kfree(pas); +} + +static void psp_assoc_free_queue(struct rcu_head *head) +{ + struct psp_assoc *pas = container_of(head, struct psp_assoc, rcu); + + INIT_WORK(&pas->work, psp_assoc_free); + schedule_work(&pas->work); +} + +/** + * psp_assoc_put() - release a reference on a PSP association + * @pas: association to release + */ +void psp_assoc_put(struct psp_assoc *pas) +{ + if (pas && refcount_dec_and_test(&pas->refcnt)) + call_rcu(&pas->rcu, psp_assoc_free_queue); +} + +void psp_sk_assoc_free(struct sock *sk) +{ + struct psp_assoc *pas = rcu_dereference_protected(sk->psp_assoc, 1); + + rcu_assign_pointer(sk->psp_assoc, NULL); + psp_assoc_put(pas); +} + +int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + int err; + + memcpy(&pas->rx, key, sizeof(*key)); + + lock_sock(sk); + + if (psp_sk_assoc(sk)) { + NL_SET_ERR_MSG(extack, "Socket already has PSP state"); + err = -EBUSY; + goto exit_unlock; + } + + refcount_inc(&pas->refcnt); + rcu_assign_pointer(sk->psp_assoc, pas); + err = 0; + +exit_unlock: + release_sock(sk); + + return err; +} + +static int psp_sock_recv_queue_check(struct sock *sk, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse; + struct sk_buff *skb; + + skb_rbtree_walk(skb, &tcp_sk(sk)->out_of_order_queue) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + + skb_queue_walk(&sk->sk_receive_queue, skb) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + return 0; +} + +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + struct psp_assoc *pas, *dummy; + int err; + + lock_sock(sk); + + pas = psp_sk_assoc(sk); + if (!pas) { + NL_SET_ERR_MSG(extack, "Socket has no Rx key"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->psd != psd) { + NL_SET_ERR_MSG(extack, "Rx key from different device"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->version != version) { + NL_SET_ERR_MSG(extack, + "PSP version mismatch with existing state"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->tx.spi) { + NL_SET_ERR_MSG(extack, "Tx key already set"); + err = -EBUSY; + goto exit_unlock; + } + + err = psp_sock_recv_queue_check(sk, pas); + if (err) { + NL_SET_ERR_MSG(extack, "Socket has incompatible segments already in the recv queue"); + goto exit_unlock; + } + + /* Pass a fake association to drivers to make sure they don't + * try to store pointers to it. For re-keying we'll need to + * re-allocate the assoc structures. + */ + dummy = psp_assoc_dummy(pas); + if (!dummy) { + err = -ENOMEM; + goto exit_unlock; + } + + memcpy(&dummy->tx, key, sizeof(*key)); + err = psp_dev_tx_key_add(psd, dummy, extack); + if (err) + goto exit_free_dummy; + + memcpy(pas->drv_data, dummy->drv_data, psd->caps->assoc_drv_spc); + memcpy(&pas->tx, key, sizeof(*key)); + + WRITE_ONCE(sk->sk_validate_xmit_skb, psp_validate_xmit); + tcp_write_collapse_fence(sk); + pas->upgrade_seq = tcp_sk(sk)->rcv_nxt; + +exit_free_dummy: + kfree(dummy); +exit_unlock: + release_sock(sk); + return err; +} + +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) +{ + struct psp_assoc *pas = psp_sk_assoc(sk); + + if (pas) + refcount_inc(&pas->refcnt); + rcu_assign_pointer(tw->psp_assoc, pas); + tw->tw_validate_xmit_skb = psp_validate_xmit; +} + +void psp_twsk_assoc_free(struct inet_timewait_sock *tw) +{ + struct psp_assoc *pas = rcu_dereference_protected(tw->psp_assoc, 1); + + rcu_assign_pointer(tw->psp_assoc, NULL); + psp_assoc_put(pas); +} + +void psp_reply_set_decrypted(struct sk_buff *skb) +{ + struct psp_assoc *pas; + + rcu_read_lock(); + pas = psp_sk_get_assoc_rcu(skb->sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; + rcu_read_unlock(); +} +EXPORT_IPV6_MOD_GPL(psp_reply_set_decrypted); -- cgit v1.2.3 From e97269257fe437910cddc7c642a636ca3cf9fb1d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:36 -0700 Subject: net: psp: update the TCP MSS to reflect PSP packet overhead PSP eats 40B of header space. Adjust MSS appropriately. We can either modify tcp_mtu_to_mss() / tcp_mss_to_mtu() or reuse icsk_ext_hdr_len. The former option is more TCP specific and has runtime overhead. The latter is a bit of a hack as PSP is not an ext_hdr. If one squints hard enough, UDP encap is just a more practical version of IPv6 exthdr, so go with the latter. Happy to change. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-10-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/psp/functions.h | 14 ++++++++++++++ include/net/psp/types.h | 3 +++ net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/ipv6_sockglue.c | 6 +++++- net/ipv6/tcp_ipv6.c | 6 +++--- net/psp/psp_sock.c | 5 +++++ 6 files changed, 32 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 0d7141230f47..183a3c9216b7 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -143,6 +144,14 @@ static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) return psp_sk_get_assoc_rcu(skb->sk); } + +static inline unsigned int psp_sk_overhead(const struct sock *sk) +{ + int psp_encap = sizeof(struct udphdr) + PSP_HDR_SIZE + PSP_TRL_SIZE; + bool has_psp = rcu_access_pointer(sk->psp_assoc); + + return has_psp ? psp_encap : 0; +} #else static inline void psp_sk_assoc_free(struct sock *sk) { } static inline void @@ -182,6 +191,11 @@ static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { return NULL; } + +static inline unsigned int psp_sk_overhead(const struct sock *sk) +{ + return 0; +} #endif static inline unsigned long diff --git a/include/net/psp/types.h b/include/net/psp/types.h index b0e32e7165a3..f93ad0e6c04f 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -93,6 +93,9 @@ struct psp_dev_caps { #define PSP_MAX_KEY 32 +#define PSP_HDR_SIZE 16 /* We don't support optional fields, yet */ +#define PSP_TRL_SIZE 16 /* AES-GCM/GMAC trailer size */ + struct psp_skb_ext { __be32 spi; u16 dev_id; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f27f6f865a48..b1fcf3e4e1ce 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -294,9 +294,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_dport = usin->sin_port; sk_daddr_set(sk, daddr); - inet_csk(sk)->icsk_ext_hdr_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk); if (inet_opt) - inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; + inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen; tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e66ec623972e..a61e742794f9 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -107,7 +108,10 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; + + icsk->icsk_ext_hdr_len = + psp_sk_overhead(sk) + + opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4da8eb9183d7..43d1109e2180 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -302,10 +302,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(sk, dst, NULL, NULL); - icsk->icsk_ext_hdr_len = 0; + icsk->icsk_ext_hdr_len = psp_sk_overhead(sk); if (opt) - icsk->icsk_ext_hdr_len = opt->opt_flen + - opt->opt_nflen; + icsk->icsk_ext_hdr_len += opt->opt_flen + + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c index 8ebccee94593..10e1fda30aa0 100644 --- a/net/psp/psp_sock.c +++ b/net/psp/psp_sock.c @@ -180,6 +180,7 @@ int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, u32 version, struct psp_key_parsed *key, struct netlink_ext_ack *extack) { + struct inet_connection_sock *icsk; struct psp_assoc *pas, *dummy; int err; @@ -236,6 +237,10 @@ int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, tcp_write_collapse_fence(sk); pas->upgrade_seq = tcp_sk(sk)->rcv_nxt; + icsk = inet_csk(sk); + icsk->icsk_ext_hdr_len += psp_sk_overhead(sk); + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); + exit_free_dummy: kfree(dummy); exit_unlock: -- cgit v1.2.3 From e78851058b35deb9f2d60ecf698fbf7ae7790d09 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:37 -0700 Subject: psp: track generations of device key There is a (somewhat theoretical in absence of multi-host support) possibility that another entity will rotate the key and we won't know. This may lead to accepting packets with matching SPI but which used different crypto keys than we expected. The PSP Architecture specification mentions that an implementation should track device key generation when device keys are managed by the NIC. Some PSP implementations may opt to include this key generation state in decryption metadata each time a device key is used to decrypt a packet. If that is the case, that key generation counter can also be used when policy checking a decrypted skb against a psp_assoc. This is an optional feature that is not explicitly part of the PSP spec, but can provide additional security in the case where an attacker may have the ability to force key rotations faster than rekeying can occur. Since we're tracking "key generations" more explicitly now, maintain different lists for associations from different generations. This way we can catch stale associations (the user space should listen to rotation notifications and change the keys). Drivers can "opt out" of generation tracking by setting the generation value to 0. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-11-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/psp/types.h | 10 ++++++++++ net/psp/psp.h | 1 + net/psp/psp_main.c | 6 +++++- net/psp/psp_nl.c | 10 ++++++++++ net/psp/psp_sock.c | 16 ++++++++++++++++ 5 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/psp/types.h b/include/net/psp/types.h index f93ad0e6c04f..ec218747ced0 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -50,8 +50,12 @@ struct psp_dev_config { * @lock: instance lock, protects all fields * @refcnt: reference count for the instance * @id: instance id + * @generation: current generation of the device key * @config: current device configuration * @active_assocs: list of registered associations + * @prev_assocs: associations which use old (but still usable) + * device key + * @stale_assocs: associations which use a rotated out key * * @rcu: RCU head for freeing the structure */ @@ -67,13 +71,19 @@ struct psp_dev { u32 id; + u8 generation; + struct psp_dev_config config; struct list_head active_assocs; + struct list_head prev_assocs; + struct list_head stale_assocs; struct rcu_head rcu; }; +#define PSP_GEN_VALID_MASK 0x7f + /** * struct psp_dev_caps - PSP device capabilities */ diff --git a/net/psp/psp.h b/net/psp/psp.h index defd3e3fd5e7..0f34e1a23fdd 100644 --- a/net/psp/psp.h +++ b/net/psp/psp.h @@ -27,6 +27,7 @@ int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, u32 version, struct psp_key_parsed *key, struct netlink_ext_ack *extack); +void psp_assocs_key_rotated(struct psp_dev *psd); static inline void psp_dev_get(struct psp_dev *psd) { diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index a1ae3c8920c3..98ad8c85b58e 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -72,6 +72,8 @@ psp_dev_create(struct net_device *netdev, mutex_init(&psd->lock); INIT_LIST_HEAD(&psd->active_assocs); + INIT_LIST_HEAD(&psd->prev_assocs); + INIT_LIST_HEAD(&psd->stale_assocs); refcount_set(&psd->refcnt, 1); mutex_lock(&psp_devs_lock); @@ -125,7 +127,9 @@ void psp_dev_unregister(struct psp_dev *psd) xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); mutex_unlock(&psp_devs_lock); - list_for_each_entry_safe(pas, next, &psd->active_assocs, assocs_list) + list_splice_init(&psd->active_assocs, &psd->prev_assocs); + list_splice_init(&psd->prev_assocs, &psd->stale_assocs); + list_for_each_entry_safe(pas, next, &psd->stale_assocs, assocs_list) psp_dev_tx_key_del(psd, pas); rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index 1b1d08fce637..8aaca62744c3 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -230,6 +230,7 @@ int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) struct psp_dev *psd = info->user_ptr[0]; struct genl_info ntf_info; struct sk_buff *ntf, *rsp; + u8 prev_gen; int err; rsp = psp_nl_reply_new(info); @@ -249,10 +250,19 @@ int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) goto err_free_ntf; } + /* suggest the next gen number, driver can override */ + prev_gen = psd->generation; + psd->generation = (prev_gen + 1) & PSP_GEN_VALID_MASK; + err = psd->ops->key_rotate(psd, info->extack); if (err) goto err_free_ntf; + WARN_ON_ONCE((psd->generation && psd->generation == prev_gen) || + psd->generation & ~PSP_GEN_VALID_MASK); + + psp_assocs_key_rotated(psd); + nlmsg_end(ntf, (struct nlmsghdr *)ntf->data); genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, 0, PSP_NLGRP_USE, GFP_KERNEL); diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c index 10e1fda30aa0..afa966c6b69d 100644 --- a/net/psp/psp_sock.c +++ b/net/psp/psp_sock.c @@ -60,6 +60,7 @@ struct psp_assoc *psp_assoc_create(struct psp_dev *psd) pas->psd = psd; pas->dev_id = psd->id; + pas->generation = psd->generation; psp_dev_get(psd); refcount_set(&pas->refcnt, 1); @@ -248,6 +249,21 @@ exit_unlock: return err; } +void psp_assocs_key_rotated(struct psp_dev *psd) +{ + struct psp_assoc *pas, *next; + + /* Mark the stale associations as invalid, they will no longer + * be able to Rx any traffic. + */ + list_for_each_entry_safe(pas, next, &psd->prev_assocs, assocs_list) + pas->generation |= ~PSP_GEN_VALID_MASK; + list_splice_init(&psd->prev_assocs, &psd->stale_assocs); + list_splice_init(&psd->active_assocs, &psd->prev_assocs); + + /* TODO: we should inform the sockets that got shut down */ +} + void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { struct psp_assoc *pas = psp_sk_assoc(sk); -- cgit v1.2.3 From fc724515741a1b86ca0457825fdb784ab038e92c Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Tue, 16 Sep 2025 17:09:40 -0700 Subject: psp: provide encapsulation helper for drivers Create a new function psp_encapsulate(), which takes a TCP packet and PSP encapsulates it according to the "Transport Mode Packet Format" section of the PSP Architecture Specification. psp_encapsulate() does not push a PSP trailer onto the skb. Both IPv6 and IPv4 are supported. Virtualization cookie is not included. Reviewed-by: Willem de Bruijn Signed-off-by: Raed Salem Signed-off-by: Rahul Rameshbabu Signed-off-by: Cosmin Ratiu Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-14-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/psp/functions.h | 2 ++ include/net/psp/types.h | 2 ++ net/psp/psp_main.c | 65 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) (limited to 'include/net') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 183a3c9216b7..0a539e1b39f4 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -17,6 +17,8 @@ struct psp_dev * psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, void *priv_ptr); void psp_dev_unregister(struct psp_dev *psd); +bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, __be16 sport); /* Kernel-facing API */ void psp_assoc_put(struct psp_assoc *pas); diff --git a/include/net/psp/types.h b/include/net/psp/types.h index ec218747ced0..d9688e66cf09 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -20,6 +20,8 @@ struct psphdr { __be64 vc[]; /* optional */ }; +#define PSP_ENCAP_HLEN (sizeof(struct udphdr) + sizeof(struct psphdr)) + #define PSP_SPI_KEY_ID GENMASK(30, 0) #define PSP_SPI_KEY_PHASE BIT(31) diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index 98ad8c85b58e..e026880fa1a2 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -1,10 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include #include #include +#include #include "psp.h" #include "psp-nl-gen.h" @@ -158,6 +160,69 @@ unsigned int psp_key_size(u32 version) } EXPORT_SYMBOL(psp_key_size); +static void psp_write_headers(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, unsigned int udp_len, __be16 sport) +{ + struct udphdr *uh = udp_hdr(skb); + struct psphdr *psph = (struct psphdr *)(uh + 1); + + uh->dest = htons(PSP_DEFAULT_UDP_PORT); + uh->source = udp_flow_src_port(net, skb, 0, 0, false); + uh->check = 0; + uh->len = htons(udp_len); + + psph->nexthdr = IPPROTO_TCP; + psph->hdrlen = PSP_HDRLEN_NOOPT; + psph->crypt_offset = 0; + psph->verfl = FIELD_PREP(PSPHDR_VERFL_VERSION, ver) | + FIELD_PREP(PSPHDR_VERFL_ONE, 1); + psph->spi = spi; + memset(&psph->iv, 0, sizeof(psph->iv)); +} + +/* Encapsulate a TCP packet with PSP by adding the UDP+PSP headers and filling + * them in. + */ +bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, __be16 sport) +{ + u32 network_len = skb_network_header_len(skb); + u32 ethr_len = skb_mac_header_len(skb); + u32 bufflen = ethr_len + network_len; + + if (skb_cow_head(skb, PSP_ENCAP_HLEN)) + return false; + + skb_push(skb, PSP_ENCAP_HLEN); + skb->mac_header -= PSP_ENCAP_HLEN; + skb->network_header -= PSP_ENCAP_HLEN; + skb->transport_header -= PSP_ENCAP_HLEN; + memmove(skb->data, skb->data + PSP_ENCAP_HLEN, bufflen); + + if (skb->protocol == htons(ETH_P_IP)) { + ip_hdr(skb)->protocol = IPPROTO_UDP; + be16_add_cpu(&ip_hdr(skb)->tot_len, PSP_ENCAP_HLEN); + ip_hdr(skb)->check = 0; + ip_hdr(skb)->check = + ip_fast_csum((u8 *)ip_hdr(skb), ip_hdr(skb)->ihl); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + ipv6_hdr(skb)->nexthdr = IPPROTO_UDP; + be16_add_cpu(&ipv6_hdr(skb)->payload_len, PSP_ENCAP_HLEN); + } else { + return false; + } + + skb_set_inner_ipproto(skb, IPPROTO_TCP); + skb_set_inner_transport_header(skb, skb_transport_offset(skb) + + PSP_ENCAP_HLEN); + skb->encapsulation = 1; + psp_write_headers(net, skb, spi, ver, + skb->len - skb_transport_offset(skb), sport); + + return true; +} +EXPORT_SYMBOL(psp_dev_encapsulate); + static int __init psp_init(void) { mutex_init(&psp_devs_lock); -- cgit v1.2.3 From 0eddb8023cee546eb05658ef3322234de8461f3b Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Tue, 16 Sep 2025 17:09:44 -0700 Subject: psp: provide decapsulation and receive helper for drivers Create psp_dev_rcv(), which drivers can call to psp decapsulate and attach a psp_skb_ext to an skb. psp_dev_rcv() only supports what the PSP architecture specification refers to as "transport mode" packets, where the L3 header is either IPv6 or IPv4. Reviewed-by: Willem de Bruijn Signed-off-by: Raed Salem Signed-off-by: Rahul Rameshbabu Signed-off-by: Cosmin Ratiu Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-18-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/psp/functions.h | 1 + net/psp/psp_main.c | 88 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) (limited to 'include/net') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 0a539e1b39f4..91ba06733321 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -19,6 +19,7 @@ psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, void psp_dev_unregister(struct psp_dev *psd); bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, u8 ver, __be16 sport); +int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv); /* Kernel-facing API */ void psp_assoc_put(struct psp_assoc *pas); diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index e026880fa1a2..b4b756f87382 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -223,6 +223,94 @@ bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, } EXPORT_SYMBOL(psp_dev_encapsulate); +/* Receive handler for PSP packets. + * + * Presently it accepts only already-authenticated packets and does not + * support optional fields, such as virtualization cookies. The caller should + * ensure that skb->data is pointing to the mac header, and that skb->mac_len + * is set. + */ +int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) +{ + int l2_hlen = 0, l3_hlen, encap; + struct psp_skb_ext *pse; + struct psphdr *psph; + struct ethhdr *eth; + struct udphdr *uh; + __be16 proto; + bool is_udp; + + eth = (struct ethhdr *)skb->data; + proto = __vlan_get_protocol(skb, eth->h_proto, &l2_hlen); + if (proto == htons(ETH_P_IP)) + l3_hlen = sizeof(struct iphdr); + else if (proto == htons(ETH_P_IPV6)) + l3_hlen = sizeof(struct ipv6hdr); + else + return -EINVAL; + + if (unlikely(!pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN))) + return -EINVAL; + + if (proto == htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen); + + is_udp = iph->protocol == IPPROTO_UDP; + l3_hlen = iph->ihl * 4; + if (l3_hlen != sizeof(struct iphdr) && + !pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN)) + return -EINVAL; + } else { + struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen); + + is_udp = ipv6h->nexthdr == IPPROTO_UDP; + } + + if (unlikely(!is_udp)) + return -EINVAL; + + uh = (struct udphdr *)(skb->data + l2_hlen + l3_hlen); + if (unlikely(uh->dest != htons(PSP_DEFAULT_UDP_PORT))) + return -EINVAL; + + pse = skb_ext_add(skb, SKB_EXT_PSP); + if (!pse) + return -EINVAL; + + psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen + + sizeof(struct udphdr)); + pse->spi = psph->spi; + pse->dev_id = dev_id; + pse->generation = generation; + pse->version = FIELD_GET(PSPHDR_VERFL_VERSION, psph->verfl); + + encap = PSP_ENCAP_HLEN; + encap += strip_icv ? PSP_TRL_SIZE : 0; + + if (proto == htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen); + + iph->protocol = psph->nexthdr; + iph->tot_len = htons(ntohs(iph->tot_len) - encap); + iph->check = 0; + iph->check = ip_fast_csum((u8 *)iph, iph->ihl); + } else { + struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen); + + ipv6h->nexthdr = psph->nexthdr; + ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) - encap); + } + + memmove(skb->data + PSP_ENCAP_HLEN, skb->data, l2_hlen + l3_hlen); + skb_pull(skb, PSP_ENCAP_HLEN); + + if (strip_icv) + pskb_trim(skb, skb->len - PSP_TRL_SIZE); + + return 0; +} +EXPORT_SYMBOL(psp_dev_rcv); + static int __init psp_init(void) { mutex_init(&psp_devs_lock); -- cgit v1.2.3 From 87ebb628a5acb892eba41ef1d8989beb8f036034 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Sep 2025 13:53:37 +0000 Subject: net: clear sk->sk_ino in sk_set_socket(sk, NULL) Andrei Vagin reported that blamed commit broke CRIU. Indeed, while we want to keep sk_uid unchanged when a socket is cloned, we want to clear sk->sk_ino. Otherwise, sock_diag might report multiple sockets sharing the same inode number. Move the clearing part from sock_orphan() to sk_set_socket(sk, NULL), called both from sock_orphan() and sk_clone_lock(). Fixes: 5d6b58c932ec ("net: lockless sock_i_ino()") Closes: https://lore.kernel.org/netdev/aMhX-VnXkYDpKd9V@google.com/ Closes: https://github.com/checkpoint-restore/criu/issues/2744 Reported-by: Andrei Vagin Signed-off-by: Eric Dumazet Acked-by: Andrei Vagin Link: https://patch.msgid.link/20250917135337.1736101-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index fb13322a11fc..2e14283c5be1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2061,6 +2061,9 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock) if (sock) { WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid); WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino); + } else { + /* Note: sk_uid is unchanged. */ + WRITE_ONCE(sk->sk_ino, 0); } } @@ -2082,8 +2085,6 @@ static inline void sock_orphan(struct sock *sk) sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; - /* Note: sk_uid is unchanged. */ - WRITE_ONCE(sk->sk_ino, 0); write_unlock_bh(&sk->sk_callback_lock); } -- cgit v1.2.3 From 01b4a3061b1d4ded108e1a700b4414c00662954c Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:55 +0300 Subject: wifi: nl80211: Add more configuration options for NAN commands Current NAN APIs have only basic configuration for master preference and operating bands. Add and parse additional parameters which provide more control over NAN synchronization. The newly added attributes allow to publish additional NAN attributes and vendor elements in NAN beacons, control scan and discovery beacons periodicity, enable/disable DW notifications etc. Signed-off-by: Andrei Otcheretianski tested: Miriam Rachel Korenblit Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.a4779492bf8e.I375feb919bd72358173766b9fe10010c40796b33@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 60 +++++++++ include/uapi/linux/nl80211.h | 110 +++++++++++++++- net/wireless/nl80211.c | 298 ++++++++++++++++++++++++++++++++++++++----- 3 files changed, 431 insertions(+), 37 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4072a67c9cc9..e2f4ca500ea3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3912,6 +3912,38 @@ struct cfg80211_qos_map { struct cfg80211_dscp_range up[8]; }; +/** + * struct cfg80211_nan_band_config - NAN band specific configuration + * + * @chan: Pointer to the IEEE 802.11 channel structure. The channel to be used + * for NAN operations on this band. For 2.4 GHz band, this is always + * channel 6. For 5 GHz band, the channel is either 44 or 149, according + * to the regulatory constraints. If chan pointer is NULL the entire band + * configuration entry is considered invalid and should not be used. + * @rssi_close: RSSI close threshold used for NAN state transition algorithm + * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State + * Transition" of Wi-Fi Aware Specification v4.0. If not + * specified (set to 0), default device value is used. The value should + * be greater than -60 dBm. + * @rssi_middle: RSSI middle threshold used for NAN state transition algorithm. + * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State + * Transition" of Wi-Fi Aware Specification v4.0. If not + * specified (set to 0), default device value is used. The value should be + * greater than -75 dBm and less than rssi_close. + * @awake_dw_interval: Committed DW interval. Valid values range: 0-5. 0 + * indicates no wakeup for DW and can't be used on 2.4GHz band, otherwise + * 2^(n-1). + * @disable_scan: If true, the device will not scan this band for cluster + * merge. Disabling scan on 2.4 GHz band is not allowed. + */ +struct cfg80211_nan_band_config { + struct ieee80211_channel *chan; + s8 rssi_close; + s8 rssi_middle; + u8 awake_dw_interval; + bool disable_scan; +}; + /** * struct cfg80211_nan_conf - NAN configuration * @@ -3921,10 +3953,31 @@ struct cfg80211_qos_map { * @bands: operating bands, a bitmap of &enum nl80211_band values. * For instance, for NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). + * @cluster_id: cluster ID used for NAN synchronization. This is a MAC address + * that can take a value from 50-6F-9A-01-00-00 to 50-6F-9A-01-FF-FF. + * If NULL, the device will pick a random Cluster ID. + * @scan_period: period (in seconds) between NAN scans. + * @scan_dwell_time: dwell time (in milliseconds) for NAN scans. + * @discovery_beacon_interval: interval (in TUs) for discovery beacons. + * @band_cfgs: array of band specific configurations, indexed by + * &enum nl80211_band values. + * @extra_nan_attrs: pointer to additional NAN attributes. + * @extra_nan_attrs_len: length of the additional NAN attributes. + * @vendor_elems: pointer to vendor-specific elements. + * @vendor_elems_len: length of the vendor-specific elements. */ struct cfg80211_nan_conf { u8 master_pref; u8 bands; + const u8 *cluster_id; + u16 scan_period; + u16 scan_dwell_time; + u8 discovery_beacon_interval; + struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS]; + const u8 *extra_nan_attrs; + u16 extra_nan_attrs_len; + const u8 *vendor_elems; + u16 vendor_elems_len; }; /** @@ -3933,10 +3986,17 @@ struct cfg80211_nan_conf { * * @CFG80211_NAN_CONF_CHANGED_PREF: master preference * @CFG80211_NAN_CONF_CHANGED_BANDS: operating bands + * @CFG80211_NAN_CONF_CHANGED_CONFIG: changed additional configuration. + * When this flag is set, it indicates that some additional attribute(s) + * (other then master_pref and bands) have been changed. In this case, + * all the unchanged attributes will be properly configured to their + * previous values. The driver doesn't need to store any + * previous configuration besides master_pref and bands. */ enum cfg80211_nan_conf_changes { CFG80211_NAN_CONF_CHANGED_PREF = BIT(0), CFG80211_NAN_CONF_CHANGED_BANDS = BIT(1), + CFG80211_NAN_CONF_CHANGED_CONFIG = BIT(2), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index aed0b4c5d5e8..20b8202a3d58 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1085,8 +1085,9 @@ * %NL80211_ATTR_NAN_MASTER_PREF attribute and optional * %NL80211_ATTR_BANDS attributes. If %NL80211_ATTR_BANDS is * omitted or set to 0, it means don't-care and the device will - * decide what to use. After this command NAN functions can be - * added. + * decide what to use. Additional cluster configuration may be + * optionally provided with %NL80211_ATTR_NAN_CONFIG. + * After this command NAN functions can be added. * @NL80211_CMD_STOP_NAN: Stop the NAN operation, identified by * its %NL80211_ATTR_WDEV interface. * @NL80211_CMD_ADD_NAN_FUNCTION: Add a NAN function. The function is defined @@ -1115,6 +1116,10 @@ * current configuration is not changed. If it is present but * set to zero, the configuration is changed to don't-care * (i.e. the device can decide what to do). + * Additional parameters may be provided with + * %NL80211_ATTR_NAN_CONFIG. User space should provide all previously + * configured nested attributes under %NL80211_ATTR_NAN_CONFIG, even if + * only a subset was changed. * @NL80211_CMD_NAN_MATCH: Notification sent when a match is reported. * This will contain a %NL80211_ATTR_NAN_MATCH nested attribute and * %NL80211_ATTR_COOKIE. @@ -2936,6 +2941,12 @@ enum nl80211_commands { * indicate that it wants strict checking on the BSS parameters to be * modified. * + * @NL80211_ATTR_NAN_CONFIG: Nested attribute for + * extended NAN cluster configuration. This is used with + * %NL80211_CMD_START_NAN and %NL80211_CMD_CHANGE_NAN_CONFIG. + * See &enum nl80211_nan_conf_attributes for details. + * This attribute is optional. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3498,6 +3509,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_LONG_BEACON_PERIOD, NL80211_ATTR_S1G_SHORT_BEACON, NL80211_ATTR_BSS_PARAM, + NL80211_ATTR_NAN_CONFIG, /* add attributes here, update the policy in nl80211.c */ @@ -7323,6 +7335,100 @@ enum nl80211_nan_match_attributes { NL80211_NAN_MATCH_ATTR_MAX = NUM_NL80211_NAN_MATCH_ATTR - 1 }; +/** + * enum nl80211_nan_band_conf_attributes - NAN band configuration attributes + * @__NL80211_NAN_BAND_CONF_INVALID: Invalid. + * @NL80211_NAN_BAND_CONF_BAND: Band for which the configuration is + * being set. The value is according to &enum nl80211_band (u8). + * @NL80211_NAN_BAND_CONF_FREQ: Discovery frequency. This attribute shall not + * be present on 2.4 GHZ band. On 5 GHz band its presence is optional. + * The allowed values are 5220 (channel 44) or 5745 (channel 149). + * If not present, channel 149 is used if allowed, otherwise channel 44 + * will be selected. The value is in MHz (u16). + * @NL80211_NAN_BAND_CONF_RSSI_CLOSE: RSSI close threshold used for NAN state + * transition algorithm as described in chapters 3.3.6 and 3.3.7 "NAN + * Device Role and State Transition" of Wi-Fi Aware (TM) Specification + * v4.0. If not specified, default device value is used. The value should + * be greater than -60 dBm (s8). + * @NL80211_NAN_BAND_CONF_RSSI_MIDDLE: RSSI middle threshold used for NAN state + * transition algorithm as described in chapters 3.3.6 and 3.3.7 "NAN + * Device Role and State Transition" of Wi-Fi Aware (TM) Specification + * v4.0. If not present, default device value is used. The value should be + * greater than -75 dBm and less than %NL80211_NAN_BAND_CONF_RSSI_CLOSE + * (s8). + * @NL80211_NAN_BAND_CONF_WAKE_DW: Committed DW information (values 0-5). + * Value 0 means that the device will not wake up during the + * discovery window. Values 1-5 mean that the device will wake up + * during each 2^(n - 1) discovery window, where n is the value of + * this attribute. Setting this attribute to 0 is not allowed on + * 2.4 GHz band (u8). This is an optional parameter (default is 1). + * @NL80211_NAN_BAND_CONF_DISABLE_SCAN: Optional flag attribute to disable + * scanning (for cluster merge) on the band. If set, the device will not + * scan on this band anymore. Disabling scanning on 2.4 GHz band is not + * allowed. + * @NUM_NL80211_NAN_BAND_CONF_ATTR: Internal. + * @NL80211_NAN_BAND_CONF_ATTR_MAX: Highest NAN band configuration attribute. + * + * These attributes are used to configure NAN band-specific parameters. Note, + * that both RSSI attributes should be configured (or both left unset). + */ +enum nl80211_nan_band_conf_attributes { + __NL80211_NAN_BAND_CONF_INVALID, + NL80211_NAN_BAND_CONF_BAND, + NL80211_NAN_BAND_CONF_FREQ, + NL80211_NAN_BAND_CONF_RSSI_CLOSE, + NL80211_NAN_BAND_CONF_RSSI_MIDDLE, + NL80211_NAN_BAND_CONF_WAKE_DW, + NL80211_NAN_BAND_CONF_DISABLE_SCAN, + + /* keep last */ + NUM_NL80211_NAN_BAND_CONF_ATTR, + NL80211_NAN_BAND_CONF_ATTR_MAX = NUM_NL80211_NAN_BAND_CONF_ATTR - 1, +}; + +/** + * enum nl80211_nan_conf_attributes - NAN configuration attributes + * @__NL80211_NAN_CONF_INVALID: Invalid attribute, used for validation. + * @NL80211_NAN_CONF_CLUSTER_ID: ID for the NAN cluster. This is a MAC + * address that can take values from 50-6F-9A-01-00-00 to + * 50-6F-9A-01-FF-FF. This attribute is optional. If not present, + * a random Cluster ID will be chosen. + * @NL80211_NAN_CONF_EXTRA_ATTRS: Additional NAN attributes to be + * published in the beacons. This is an optional byte array. + * @NL80211_NAN_CONF_VENDOR_ELEMS: Vendor-specific elements that will + * be published in the beacons. This is an optional byte array. + * @NL80211_NAN_CONF_BAND_CONFIGS: This is a nested array attribute, + * containing multiple entries for each supported band. Each band + * configuration consists of &enum nl80211_nan_band_conf_attributes. + * @NL80211_NAN_CONF_SCAN_PERIOD: Scan period in seconds. If not configured, + * device default is used. Zero value will disable scanning. + * This is u16 (optional). + * @NL80211_NAN_CONF_SCAN_DWELL_TIME: Scan dwell time in TUs per channel. + * Only non-zero values are valid. If not configured the device default + * value is used. This is u16 (optional) + * @NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL: Discovery beacon interval + * in TUs. Valid range is 50-200 TUs. If not configured the device default + * value is used. This is u8 (optional) + * @NUM_NL80211_NAN_CONF_ATTR: Internal. + * @NL80211_NAN_CONF_ATTR_MAX: Highest NAN configuration attribute. + * + * These attributes are used to configure NAN-specific parameters. + */ +enum nl80211_nan_conf_attributes { + __NL80211_NAN_CONF_INVALID, + NL80211_NAN_CONF_CLUSTER_ID, + NL80211_NAN_CONF_EXTRA_ATTRS, + NL80211_NAN_CONF_VENDOR_ELEMS, + NL80211_NAN_CONF_BAND_CONFIGS, + NL80211_NAN_CONF_SCAN_PERIOD, + NL80211_NAN_CONF_SCAN_DWELL_TIME, + NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL, + + /* keep last */ + NUM_NL80211_NAN_CONF_ATTR, + NL80211_NAN_CONF_ATTR_MAX = NUM_NL80211_NAN_CONF_ATTR - 1, +}; + /** * enum nl80211_external_auth_action - Action to perform with external * authentication request. Used by NL80211_ATTR_EXTERNAL_AUTH_ACTION. diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b7bc7e5e81dd..04679acc8135 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -312,6 +312,26 @@ static int validate_supported_selectors(const struct nlattr *attr, return 0; } +static int validate_nan_cluster_id(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + static const u8 cluster_id_prefix[4] = {0x50, 0x6f, 0x9a, 0x1}; + + if (len != ETH_ALEN) { + NL_SET_ERR_MSG_ATTR(extack, attr, "bad cluster id length"); + return -EINVAL; + } + + if (memcmp(data, cluster_id_prefix, sizeof(cluster_id_prefix))) { + NL_SET_ERR_MSG_ATTR(extack, attr, "invalid cluster id prefix"); + return -EINVAL; + } + + return 0; +} + /* policy for the attributes */ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; @@ -500,6 +520,35 @@ nl80211_s1g_short_beacon[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1] = { IEEE80211_MAX_DATA_LEN), }; +static const struct nla_policy +nl80211_nan_band_conf_policy[NL80211_NAN_BAND_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_BAND_CONF_BAND] = NLA_POLICY_MAX(NLA_U8, + NUM_NL80211_BANDS - 1), + [NL80211_NAN_BAND_CONF_FREQ] = { .type = NLA_U16 }, + [NL80211_NAN_BAND_CONF_RSSI_CLOSE] = NLA_POLICY_MIN(NLA_S8, -59), + [NL80211_NAN_BAND_CONF_RSSI_MIDDLE] = NLA_POLICY_MIN(NLA_S8, -74), + [NL80211_NAN_BAND_CONF_WAKE_DW] = NLA_POLICY_MAX(NLA_U8, 5), + [NL80211_NAN_BAND_CONF_DISABLE_SCAN] = { .type = NLA_FLAG }, +}; + +static const struct nla_policy +nl80211_nan_conf_policy[NL80211_NAN_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_CONF_CLUSTER_ID] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_nan_cluster_id, + ETH_ALEN), + [NL80211_NAN_CONF_EXTRA_ATTRS] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN}, + [NL80211_NAN_CONF_VENDOR_ELEMS] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), + [NL80211_NAN_CONF_BAND_CONFIGS] = + NLA_POLICY_NESTED_ARRAY(nl80211_nan_band_conf_policy), + [NL80211_NAN_CONF_SCAN_PERIOD] = { .type = NLA_U16 }, + [NL80211_NAN_CONF_SCAN_DWELL_TIME] = NLA_POLICY_RANGE(NLA_U16, 50, 512), + [NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL] = + NLA_POLICY_RANGE(NLA_U8, 50, 200), +}; + static const struct netlink_range_validation nl80211_punct_bitmap_range = { .min = 0, .max = 0xffff, @@ -769,6 +818,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, + [NL80211_ATTR_NAN_CONFIG] = NLA_POLICY_NESTED(nl80211_nan_conf_policy), [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, @@ -15398,6 +15448,211 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info) return 0; } +static struct ieee80211_channel *nl80211_get_nan_channel(struct wiphy *wiphy, + int freq) +{ + struct ieee80211_channel *chan; + struct cfg80211_chan_def def; + + /* Check if the frequency is valid for NAN */ + if (freq != 5220 && freq != 5745 && freq != 2437) + return NULL; + + chan = ieee80211_get_channel(wiphy, freq); + if (!chan) + return NULL; + + cfg80211_chandef_create(&def, chan, NL80211_CHAN_NO_HT); + + /* Check if the channel is allowed */ + if (cfg80211_reg_can_beacon(wiphy, &def, NL80211_IFTYPE_NAN)) + return chan; + + return NULL; +} + +static int nl80211_parse_nan_band_config(struct wiphy *wiphy, + struct nlattr **tb, + struct cfg80211_nan_band_config *cfg, + enum nl80211_band band) +{ + if (BIT(band) & ~(u32)wiphy->nan_supported_bands) + return -EINVAL; + + if (tb[NL80211_NAN_BAND_CONF_FREQ]) { + u16 freq = nla_get_u16(tb[NL80211_NAN_BAND_CONF_FREQ]); + + if (band != NL80211_BAND_5GHZ) + return -EINVAL; + + cfg->chan = nl80211_get_nan_channel(wiphy, freq); + if (!cfg->chan) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]) { + cfg->rssi_close = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]); + if (!tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) { + cfg->rssi_middle = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]); + if (!cfg->rssi_close || cfg->rssi_middle >= cfg->rssi_close) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_WAKE_DW]) { + cfg->awake_dw_interval = + nla_get_u8(tb[NL80211_NAN_BAND_CONF_WAKE_DW]); + + if (band == NL80211_BAND_2GHZ && cfg->awake_dw_interval == 0) + return -EINVAL; + } + + cfg->disable_scan = + nla_get_flag(tb[NL80211_NAN_BAND_CONF_DISABLE_SCAN]); + return 0; +} + +static int nl80211_parse_nan_conf(struct wiphy *wiphy, + struct genl_info *info, + struct cfg80211_nan_conf *conf, + u32 *changed_flags) +{ + struct nlattr *attrs[NL80211_NAN_CONF_ATTR_MAX + 1]; + int err, rem; + u32 changed = 0; + struct nlattr *band_config; + + if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { + conf->master_pref = + nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); + + changed |= CFG80211_NAN_CONF_CHANGED_PREF; + } + + if (info->attrs[NL80211_ATTR_BANDS]) { + u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); + + if (bands & ~(u32)wiphy->nan_supported_bands) + return -EOPNOTSUPP; + + if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) + return -EINVAL; + + conf->bands = bands; + changed |= CFG80211_NAN_CONF_CHANGED_BANDS; + } + + conf->band_cfgs[NL80211_BAND_2GHZ].awake_dw_interval = 1; + if (conf->bands & BIT(NL80211_BAND_5GHZ) || !conf->bands) + conf->band_cfgs[NL80211_BAND_5GHZ].awake_dw_interval = 1; + + /* On 2.4 GHz band use channel 6 */ + conf->band_cfgs[NL80211_BAND_2GHZ].chan = + nl80211_get_nan_channel(wiphy, 2437); + if (!conf->band_cfgs[NL80211_BAND_2GHZ].chan) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_NAN_CONFIG]) + goto out; + + err = nla_parse_nested(attrs, NL80211_NAN_CONF_ATTR_MAX, + info->attrs[NL80211_ATTR_NAN_CONFIG], NULL, + info->extack); + if (err) + return err; + + changed |= CFG80211_NAN_CONF_CHANGED_CONFIG; + if (attrs[NL80211_NAN_CONF_CLUSTER_ID]) + conf->cluster_id = + nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]); + + if (attrs[NL80211_NAN_CONF_EXTRA_ATTRS]) { + conf->extra_nan_attrs = + nla_data(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + conf->extra_nan_attrs_len = + nla_len(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + } + + if (attrs[NL80211_NAN_CONF_VENDOR_ELEMS]) { + conf->vendor_elems = + nla_data(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + conf->vendor_elems_len = + nla_len(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + } + + if (attrs[NL80211_NAN_CONF_BAND_CONFIGS]) { + nla_for_each_nested(band_config, + attrs[NL80211_NAN_CONF_BAND_CONFIGS], + rem) { + enum nl80211_band band; + struct cfg80211_nan_band_config *cfg; + struct nlattr *tb[NL80211_NAN_BAND_CONF_ATTR_MAX + 1]; + + err = nla_parse_nested(tb, + NL80211_NAN_BAND_CONF_ATTR_MAX, + band_config, NULL, + info->extack); + if (err) + return err; + + if (!tb[NL80211_NAN_BAND_CONF_BAND]) + return -EINVAL; + + band = nla_get_u8(tb[NL80211_NAN_BAND_CONF_BAND]); + if (conf->bands && !(conf->bands & BIT(band))) + return -EINVAL; + + cfg = &conf->band_cfgs[band]; + + err = nl80211_parse_nan_band_config(wiphy, tb, cfg, + band); + if (err) + return err; + } + } + + if (attrs[NL80211_NAN_CONF_SCAN_PERIOD]) + conf->scan_period = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_PERIOD]); + + if (attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]) + conf->scan_dwell_time = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]); + + if (attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]) + conf->discovery_beacon_interval = + nla_get_u8(attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]); +out: + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + (!conf->bands || conf->bands & BIT(NL80211_BAND_5GHZ))) { + /* If no 5GHz channel is specified use default, if possible */ + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5745); + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan) + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5220); + + /* Return error if user space asked explicitly for 5 GHz */ + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + conf->bands & BIT(NL80211_BAND_5GHZ)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[NL80211_ATTR_BANDS], + "5 GHz band operation is not allowed"); + return -EINVAL; + } + } + + if (changed_flags) + *changed_flags = changed; + + return 0; +} + static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -15414,23 +15669,13 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (rfkill_blocked(rdev->wiphy.rfkill)) return -ERFKILL; + /* Master preference is mandatory for START_NAN */ if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) return -EINVAL; - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); - - if (bands & ~(u32)wdev->wiphy->nan_supported_bands) - return -EOPNOTSUPP; - - if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) - return -EINVAL; - - conf.bands = bands; - } + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL); + if (err) + return err; err = rdev_start_nan(rdev, wdev, &conf); if (err) @@ -15786,6 +16031,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb, struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_nan_conf conf = {}; u32 changed = 0; + int err; if (wdev->iftype != NL80211_IFTYPE_NAN) return -EOPNOTSUPP; @@ -15793,27 +16039,9 @@ static int nl80211_nan_change_config(struct sk_buff *skb, if (!wdev_running(wdev)) return -ENOTCONN; - if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - if (conf.master_pref <= 1 || conf.master_pref == 255) - return -EINVAL; - - changed |= CFG80211_NAN_CONF_CHANGED_PREF; - } - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); - - if (bands & ~(u32)wdev->wiphy->nan_supported_bands) - return -EOPNOTSUPP; - - if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) - return -EINVAL; - - conf.bands = bands; - changed |= CFG80211_NAN_CONF_CHANGED_BANDS; - } + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed); + if (err) + return err; if (!changed) return -EINVAL; -- cgit v1.2.3 From ba9b2ceaa2558a38a5da59fd654b641610a8568e Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:56 +0300 Subject: wifi: nl80211: Add NAN Discovery Window (DW) notification This notification will be used by the device to inform user space about upcoming DW. When received, user space will be able to prepare multicast Service Discovery Frames (SDFs) to be transmitted during the next DW using %NL80211_CMD_FRAME command on the NAN management interface. The device/driver will take care to transmit the frames in the correct timing. This allows to implement a synchronized Discovery Engine (DE) in user space, if the device doesn't support DE offload. Note that this notification can be sent before the actual DW starts as long as the driver/device handles the actual timing of the SDF transmission. Signed-off-by: Andrei Otcheretianski Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.0e1d15031bab.I5b1721e61b63910452b3c5cdcdc1e94cb094d4c9@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 ++++++++++++ include/uapi/linux/nl80211.h | 16 ++++++++++++++++ net/wireless/nl80211.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 16 ++++++++++++++++ 4 files changed, 89 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e2f4ca500ea3..0c1311d254be 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3959,6 +3959,8 @@ struct cfg80211_nan_band_config { * @scan_period: period (in seconds) between NAN scans. * @scan_dwell_time: dwell time (in milliseconds) for NAN scans. * @discovery_beacon_interval: interval (in TUs) for discovery beacons. + * @enable_dw_notification: flag to enable/disable discovery window + * notifications. * @band_cfgs: array of band specific configurations, indexed by * &enum nl80211_band values. * @extra_nan_attrs: pointer to additional NAN attributes. @@ -3973,6 +3975,7 @@ struct cfg80211_nan_conf { u16 scan_period; u16 scan_dwell_time; u8 discovery_beacon_interval; + bool enable_dw_notification; struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS]; const u8 *extra_nan_attrs; u16 extra_nan_attrs_len; @@ -10062,6 +10065,15 @@ void cfg80211_schedule_channels_check(struct wireless_dev *wdev); */ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled); +/** + * cfg80211_next_nan_dw_notif - Notify about the next NAN Discovery Window (DW) + * @wdev: Pointer to the wireless device structure + * @chan: DW channel (6, 44 or 149) + * @gfp: Memory allocation flags + */ +void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, + struct ieee80211_channel *chan, gfp_t gfp); + #ifdef CONFIG_CFG80211_DEBUGFS /** * wiphy_locked_debugfs_read - do a locked read in debugfs diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 20b8202a3d58..d674608e2635 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1349,6 +1349,15 @@ * control EPCS configuration. Used to notify userland on the current state * of EPCS. * + * @NL80211_CMD_NAN_NEXT_DW_NOTIFICATION: This command is used to notify + * user space about the next NAN Discovery Window (DW). User space may use + * it to prepare frames to be sent in the next DW. + * %NL80211_ATTR_WIPHY_FREQ is used to indicate the frequency of the next + * DW. SDF transmission should be requested with %NL80211_CMD_FRAME and + * the device/driver shall take care of the actual transmission timing. + * This notification is only sent to the NAN interface owning socket + * (see %NL80211_ATTR_SOCKET_OWNER flag). + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1609,6 +1618,8 @@ enum nl80211_commands { NL80211_CMD_ASSOC_MLO_RECONF, NL80211_CMD_EPCS_CFG, + NL80211_CMD_NAN_NEXT_DW_NOTIFICATION, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -7409,6 +7420,10 @@ enum nl80211_nan_band_conf_attributes { * @NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL: Discovery beacon interval * in TUs. Valid range is 50-200 TUs. If not configured the device default * value is used. This is u8 (optional) + * @NL80211_NAN_CONF_NOTIFY_DW: If set, the driver will notify userspace about + * the upcoming discovery window with + * %NL80211_CMD_NAN_NEXT_DW_NOTIFICATION. + * This is a flag attribute. * @NUM_NL80211_NAN_CONF_ATTR: Internal. * @NL80211_NAN_CONF_ATTR_MAX: Highest NAN configuration attribute. * @@ -7423,6 +7438,7 @@ enum nl80211_nan_conf_attributes { NL80211_NAN_CONF_SCAN_PERIOD, NL80211_NAN_CONF_SCAN_DWELL_TIME, NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL, + NL80211_NAN_CONF_NOTIFY_DW, /* keep last */ NUM_NL80211_NAN_CONF_ATTR, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 04679acc8135..d64145746b65 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -547,6 +547,7 @@ nl80211_nan_conf_policy[NL80211_NAN_CONF_ATTR_MAX + 1] = { [NL80211_NAN_CONF_SCAN_DWELL_TIME] = NLA_POLICY_RANGE(NLA_U16, 50, 512), [NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL] = NLA_POLICY_RANGE(NLA_U8, 50, 200), + [NL80211_NAN_CONF_NOTIFY_DW] = { .type = NLA_FLAG }, }; static const struct netlink_range_validation nl80211_punct_bitmap_range = { @@ -15627,6 +15628,11 @@ static int nl80211_parse_nan_conf(struct wiphy *wiphy, if (attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]) conf->discovery_beacon_interval = nla_get_u8(attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]); + + if (attrs[NL80211_NAN_CONF_NOTIFY_DW]) + conf->enable_dw_notification = + nla_get_flag(attrs[NL80211_NAN_CONF_NOTIFY_DW]); + out: if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && (!conf->bands || conf->bands & BIT(NL80211_BAND_5GHZ))) { @@ -21764,6 +21770,45 @@ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled) } EXPORT_SYMBOL(cfg80211_epcs_changed); +void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, + struct ieee80211_channel *chan, gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_next_nan_dw_notif(wdev, chan); + + if (!wdev->owner_nlportid) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, + NL80211_CMD_NAN_NEXT_DW_NOTIFICATION); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_unicast(wiphy_net(wiphy), msg, wdev->owner_nlportid); + + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_next_nan_dw_notif); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 9b6074155d59..ff47e9bffd4f 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -4166,6 +4166,22 @@ TRACE_EVENT(cfg80211_epcs_changed, WDEV_PR_ARG, __entry->enabled) ); +TRACE_EVENT(cfg80211_next_nan_dw_notif, + TP_PROTO(struct wireless_dev *wdev, + struct ieee80211_channel *chan), + TP_ARGS(wdev, chan), + TP_STRUCT__entry( + WDEV_ENTRY + CHAN_ENTRY + ), + TP_fast_assign( + WDEV_ASSIGN; + CHAN_ASSIGN(chan); + ), + TP_printk(WDEV_PR_FMT " " CHAN_PR_FMT, + WDEV_PR_ARG, CHAN_PR_ARG) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 1ccfd8db34fb3b1852284668094d7207499c2415 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:57 +0300 Subject: wifi: cfg80211: Add cluster joined notification APIs The drivers should notify upper layers and user space when a NAN device joins a cluster. This is needed, for example, to set the correct addr3 in SDF frames. Add API to report cluster join event. Signed-off-by: Andrei Otcheretianski Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.ad27b7b6e4d9.I70b213a2a49f18d1ba2ad325e67e8eff51cc7a1f@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 ++++++++++++++ include/uapi/linux/nl80211.h | 8 ++++++++ net/wireless/nl80211.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 19 +++++++++++++++++++ 4 files changed, 82 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0c1311d254be..1b10bd31bdd6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -10074,6 +10074,20 @@ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled); void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, struct ieee80211_channel *chan, gfp_t gfp); +/** + * cfg80211_nan_cluster_joined - Notify about NAN cluster join + * @wdev: Pointer to the wireless device structure + * @cluster_id: Cluster ID of the NAN cluster that was joined or started + * @new_cluster: Indicates if this is a new cluster or an existing one + * @gfp: Memory allocation flags + * + * This function is used to notify user space when a NAN cluster has been + * joined, providing the cluster ID and a flag whether it is a new cluster. + */ +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp); + #ifdef CONFIG_CFG80211_DEBUGFS /** * wiphy_locked_debugfs_read - do a locked read in debugfs diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d674608e2635..c5a7658b7297 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1357,6 +1357,9 @@ * the device/driver shall take care of the actual transmission timing. * This notification is only sent to the NAN interface owning socket * (see %NL80211_ATTR_SOCKET_OWNER flag). + * @NL80211_CMD_NAN_CLUSTER_JOINED: This command is used to notify + * user space that the NAN new cluster has been joined. The cluster ID is + * indicated by %NL80211_ATTR_MAC. * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use @@ -1619,6 +1622,7 @@ enum nl80211_commands { NL80211_CMD_EPCS_CFG, NL80211_CMD_NAN_NEXT_DW_NOTIFICATION, + NL80211_CMD_NAN_CLUSTER_JOINED, /* add new commands above here */ @@ -2957,6 +2961,9 @@ enum nl80211_commands { * %NL80211_CMD_START_NAN and %NL80211_CMD_CHANGE_NAN_CONFIG. * See &enum nl80211_nan_conf_attributes for details. * This attribute is optional. + * @NL80211_ATTR_NAN_NEW_CLUSTER: Flag attribute indicating that a new + * NAN cluster has been created. This is used with + * %NL80211_CMD_NAN_CLUSTER_JOINED * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -3521,6 +3528,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_SHORT_BEACON, NL80211_ATTR_BSS_PARAM, NL80211_ATTR_NAN_CONFIG, + NL80211_ATTR_NAN_NEW_CLUSTER, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d64145746b65..904a725a4f4a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -21809,6 +21809,47 @@ void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, } EXPORT_SYMBOL(cfg80211_next_nan_dw_notif); +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_nan_cluster_joined(wdev, cluster_id, new_cluster); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_CLUSTER_JOINED); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, cluster_id) || + (new_cluster && nla_put_flag(msg, NL80211_ATTR_NAN_NEW_CLUSTER))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + if (!wdev->owner_nlportid) + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(wiphy), + msg, 0, NL80211_MCGRP_NAN, gfp); + else + genlmsg_unicast(wiphy_net(wiphy), msg, + wdev->owner_nlportid); + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_nan_cluster_joined); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index ff47e9bffd4f..8a4c34112eb5 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -4182,6 +4182,25 @@ TRACE_EVENT(cfg80211_next_nan_dw_notif, WDEV_PR_ARG, CHAN_PR_ARG) ); +TRACE_EVENT(cfg80211_nan_cluster_joined, + TP_PROTO(struct wireless_dev *wdev, + const u8 *cluster_id, + bool new_cluster), + TP_ARGS(wdev, cluster_id, new_cluster), + TP_STRUCT__entry( + WDEV_ENTRY + MAC_ENTRY(cluster_id) + __field(bool, new_cluster) + ), + TP_fast_assign( + WDEV_ASSIGN; + MAC_ASSIGN(cluster_id, cluster_id); + __entry->new_cluster = new_cluster; + ), + TP_printk(WDEV_PR_FMT " cluster_id %pMF%s", + WDEV_PR_ARG, __entry->cluster_id, + __entry->new_cluster ? " [new]" : "") +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From b9c3d426c8a5823b3a1e5078719750c6abb0d2c1 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:12:59 +0300 Subject: wifi: cfg80211: Advertise supported NAN capabilities Allow drivers to specify the supported NAN capabilities and support advertising the NAN capabilities to user space. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.2976966556f5.Ic6e43b10049573180c909dad806f279cfb31143e@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 17 +++++++++++++++++ include/net/cfg80211.h | 38 ++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) (limited to 'include/net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d350263f23f3..2110345de8ef 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -6065,4 +6065,21 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) _data + ieee80211_mle_common_size(_data),\ _len - ieee80211_mle_common_size(_data)) +/* NAN operation mode, as defined in Wi-Fi Aware (TM) specification Table 81 */ +#define NAN_OP_MODE_PHY_MODE_VHT 0x01 +#define NAN_OP_MODE_PHY_MODE_HE 0x10 +#define NAN_OP_MODE_PHY_MODE_MASK 0x11 +#define NAN_OP_MODE_80P80MHZ 0x02 +#define NAN_OP_MODE_160MHZ 0x04 +#define NAN_OP_MODE_PNDL_SUPPRTED 0x08 + +/* NAN Device capabilities, as defined in Wi-Fi Aware (TM) specification + * Table 79 + */ +#define NAN_DEV_CAPA_DFS_OWNER 0x01 +#define NAN_DEV_CAPA_EXT_KEY_ID_SUPPORTED 0x02 +#define NAN_DEV_CAPA_SIM_NDP_RX_SUPPORTED 0x04 +#define NAN_DEV_CAPA_NDPE_SUPPORTED 0x08 +#define NAN_DEV_CAPA_S3_SUPPORTED 0x10 + #endif /* LINUX_IEEE80211_H */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1b10bd31bdd6..e30c1886c530 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5711,6 +5711,42 @@ struct wiphy_radio { u32 antenna_mask; }; +/** + * enum wiphy_nan_flags - NAN capabilities + * + * @WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC: Device supports NAN configurable + * synchronization. + * @WIPHY_NAN_FLAGS_USERSPACE_DE: Device doesn't support DE offload. + */ +enum wiphy_nan_flags { + WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC = BIT(0), + WIPHY_NAN_FLAGS_USERSPACE_DE = BIT(1), +}; + +/** + * struct wiphy_nan_capa - NAN capabilities + * + * This structure describes the NAN capabilities of a wiphy. + * + * @flags: NAN capabilities flags, see &enum wiphy_nan_flags + * @op_mode: NAN operation mode, as defined in Wi-Fi Aware (TM) specification + * Table 81. + * @n_antennas: number of antennas supported by the device for Tx/Rx. Lower + * nibble indicates the number of TX antennas and upper nibble indicates the + * number of RX antennas. Value 0 indicates the information is not + * available. + * @max_channel_switch_time: maximum channel switch time in milliseconds. + * @dev_capabilities: NAN device capabilities as defined in Wi-Fi Aware (TM) + * specification Table 79 (Capabilities field). + */ +struct wiphy_nan_capa { + u32 flags; + u8 op_mode; + u8 n_antennas; + u16 max_channel_switch_time; + u8 dev_capabilities; +}; + #define CFG80211_HW_TIMESTAMP_ALL_PEERS 0xffff /** @@ -5884,6 +5920,7 @@ struct wiphy_radio { * bitmap of &enum nl80211_band values. For instance, for * NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). + * @nan_capa: NAN capabilities * * @txq_limit: configuration of internal TX queue frame limit * @txq_memory_limit: configuration internal TX queue memory limit @@ -6065,6 +6102,7 @@ struct wiphy { u32 bss_select_support; u8 nan_supported_bands; + struct wiphy_nan_capa nan_capa; u32 txq_limit; u32 txq_memory_limit; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 904a725a4f4a..bcd18ae59e84 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2605,6 +2605,41 @@ fail: return -ENOBUFS; } +static int nl80211_put_nan_capa(struct wiphy *wiphy, struct sk_buff *msg) +{ + struct nlattr *nan_caps; + + nan_caps = nla_nest_start(msg, NL80211_ATTR_NAN_CAPABILITIES); + if (!nan_caps) + return -ENOBUFS; + + if (wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC && + nla_put_flag(msg, NL80211_NAN_CAPA_CONFIGURABLE_SYNC)) + goto fail; + + if ((wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE) && + nla_put_flag(msg, NL80211_NAN_CAPA_USERSPACE_DE)) + goto fail; + + if (nla_put_u8(msg, NL80211_NAN_CAPA_OP_MODE, + wiphy->nan_capa.op_mode) || + nla_put_u8(msg, NL80211_NAN_CAPA_NUM_ANTENNAS, + wiphy->nan_capa.n_antennas) || + nla_put_u16(msg, NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME, + wiphy->nan_capa.max_channel_switch_time) || + nla_put_u8(msg, NL80211_NAN_CAPA_CAPABILITIES, + wiphy->nan_capa.dev_capabilities)) + goto fail; + + nla_nest_end(msg, nan_caps); + + return 0; + +fail: + nla_nest_cancel(msg, nan_caps); + return -ENOBUFS; +} + struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; @@ -3257,6 +3292,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (nl80211_put_radios(&rdev->wiphy, msg)) goto nla_put_failure; + state->split_start++; + break; + case 18: + if (nl80211_put_nan_capa(&rdev->wiphy, msg)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; -- cgit v1.2.3 From 1884e2594b084a6b1eb438e5eda586f284d80fee Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:01 +0300 Subject: wifi: cfg80211: Store the NAN cluster ID When the driver indicates that the device has joined a cluster, store the cluster ID. This is needed for data path operations, e.g., filtering received frames etc. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.63e9fef2a3aa.I6c858185c9e71f84bd2c5174d7ee45902b4391c3@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ net/wireless/nl80211.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e30c1886c530..26fd42e189ce 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6681,6 +6681,9 @@ struct wireless_dev { struct { struct cfg80211_chan_def chandef; } ocb; + struct { + u8 cluster_id[ETH_ALEN] __aligned(2); + } nan; } u; struct { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 72f68a17c92b..4e0d40865441 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -21865,6 +21865,8 @@ void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, trace_cfg80211_nan_cluster_joined(wdev, cluster_id, new_cluster); + memcpy(wdev->u.nan.cluster_id, cluster_id, ETH_ALEN); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) return; -- cgit v1.2.3 From fc41f4a28ac4d462487903229494eeb266f68a40 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:02 +0300 Subject: wifi: mac80211: Support Tx of action frame for NAN Add support for sending management frame over a NAN Device interface: - Declare support for the supported management frames types. - Since action frame transmissions over a NAN Device interface do not necessarily require a channel configuration, e.g., they can be transmitted during DW, modify the Tx path to avoid accessing channel information for NAN Device interface. - In addition modify the points in the Tx path logic to account for cases that a band is not specified in the Tx information. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.23b160089228.I65a58af753bcbcfb5c4ad8ef372d546f889725ba@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ net/mac80211/main.c | 5 +++++ net/mac80211/offchannel.c | 5 ++++- net/mac80211/rate.c | 11 ++++++++++- net/mac80211/tx.c | 12 ++++++++++-- 5 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a45e4bee65d4..a5140ecf334b 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3192,6 +3192,10 @@ ieee80211_get_tx_rate(const struct ieee80211_hw *hw, { if (WARN_ON_ONCE(c->control.rates[0].idx < 0)) return NULL; + + if (c->band >= NUM_NL80211_BANDS) + return NULL; + return &hw->wiphy->bands[c->band]->bitrates[c->control.rates[0].idx]; } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 437f1363c982..27b3ec5deabe 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -746,6 +746,11 @@ ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = { BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | BIT(IEEE80211_STYPE_AUTH >> 4), }, + [NL80211_IFTYPE_NAN] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4), + }, }; static const struct ieee80211_ht_cap mac80211_ht_capa_mod_mask = { diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 13df6321634d..ae82533e3c02 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -8,7 +8,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2009 Johannes Berg - * Copyright (C) 2019, 2022-2024 Intel Corporation + * Copyright (C) 2019, 2022-2025 Intel Corporation */ #include #include @@ -897,6 +897,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, need_offchan = true; break; case NL80211_IFTYPE_NAN: + break; default: return -EOPNOTSUPP; } @@ -910,6 +911,8 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, /* Check if the operating channel is the requested channel */ if (!params->chan && mlo_sta) { need_offchan = false; + } else if (sdata->vif.type == NL80211_IFTYPE_NAN) { + /* Frames can be sent during NAN schedule */ } else if (!need_offchan) { struct ieee80211_chanctx_conf *chanctx_conf = NULL; int i; diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 3cb2ad6d0b28..e441f8541603 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -4,7 +4,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2019, 2022-2024 Intel Corporation + * Copyright (C) 2019, 2022-2025 Intel Corporation */ #include @@ -98,6 +98,9 @@ void rate_control_tx_status(struct ieee80211_local *local, if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) return; + if (st->info->band >= NUM_NL80211_BANDS) + return; + sband = local->hw.wiphy->bands[st->info->band]; spin_lock_bh(&sta->rate_ctrl_lock); @@ -419,6 +422,9 @@ static bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; + if (!sband) + return false; + if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); @@ -898,6 +904,9 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, return; sdata = vif_to_sdata(vif); + if (info->band >= NUM_NL80211_BANDS) + return; + sband = sdata->local->hw.wiphy->bands[info->band]; if (ieee80211_is_tx_data(skb)) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index a27e2af5d569..ba51198be94a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -59,6 +59,9 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, if (WARN_ON_ONCE(tx->rate.idx < 0)) return 0; + if (info->band >= NUM_NL80211_BANDS) + return 0; + sband = local->hw.wiphy->bands[info->band]; txrate = &sband->bitrates[tx->rate.idx]; @@ -683,7 +686,10 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) memset(&txrc, 0, sizeof(txrc)); - sband = tx->local->hw.wiphy->bands[info->band]; + if (info->band < NUM_NL80211_BANDS) + sband = tx->local->hw.wiphy->bands[info->band]; + else + return TX_CONTINUE; len = min_t(u32, tx->skb->len + FCS_LEN, tx->local->hw.wiphy->frag_threshold); @@ -6288,7 +6294,9 @@ void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, enum nl80211_band band; rcu_read_lock(); - if (!ieee80211_vif_is_mld(&sdata->vif)) { + if (sdata->vif.type == NL80211_IFTYPE_NAN) { + band = NUM_NL80211_BANDS; + } else if (!ieee80211_vif_is_mld(&sdata->vif)) { WARN_ON(link_id >= 0); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); -- cgit v1.2.3 From 04f17cfea2442ef2ed01da7ba1f686a58a50048e Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:06 +0300 Subject: wifi: mac80211: Export an API to check if NAN is started So it can be used by drivers to check if NAN Device interface is started or not. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.c69652f77eb6.Ie4f3d197e0706e742e3d97614fadc11b22adfbc6@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ net/mac80211/util.c | 8 ++++++++ 2 files changed, 14 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a5140ecf334b..a55085cf4ec4 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7838,4 +7838,10 @@ int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw, int n_vifs, enum ieee80211_chanctx_switch_mode mode); +/** + * ieee80211_vif_nan_started - Return whether a NAN vif is started + * @vif: the vif + * Return: %true iff the vif is a NAN interface and NAN is started + */ +bool ieee80211_vif_nan_started(struct ieee80211_vif *vif); #endif /* MAC80211_H */ diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 9eb35e3b9e52..123842b841f2 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -4512,3 +4512,11 @@ void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe) sizeof(tpe->psd_reg_client[i].power)); } } + +bool ieee80211_vif_nan_started(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + return vif->type == NL80211_IFTYPE_NAN && sdata->u.nan.started; +} +EXPORT_SYMBOL_GPL(ieee80211_vif_nan_started); -- cgit v1.2.3 From d0688dc2b172d19e20fdb8be8c37930da12aaf88 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Thu, 18 Sep 2025 15:19:11 +1000 Subject: wifi: cfg80211: correctly implement and validate S1G chandef Currently, the S1G channelisation implementation differs from that of VHT, which is the PHY that S1G is based on. The major difference between the clock rate is 1/10th of VHT. However how their channelisation is represented within cfg80211 and mac80211 vastly differ. To rectify this, remove the use of IEEE80211_CHAN_1/2/4.. flags that were previously used to indicate the control channel width, however it should be implied that the control channels are 1MHz in the case of S1G. Additionally, introduce the invert - being IEEE80211_CHAN_NO_4/8/16MHz - that imply the control channel may not be used for a certain bandwidth. With these new flags, we can perform regulatory and chandef validation just as we would for VHT. To deal with the notion that S1G PHYs may contain a 2MHz primary channel, introduce a new variable, s1g_primary_2mhz, which indicates whether we are operating on a 2MHz primary channel. In this case, the chandef::chan points to the 1MHz primary channel pointed to by the primary channel location. Alongside this, introduce some new helper routines that can extract the sibling 1MHz channel. The sibling being the alternate 1MHz primary subchannel within the 2MHz primary channel that is not pointed to by chandef::chan. Furthermore, due to unique restrictions imposed on S1G PHYs, introduce a new flag, IEEE80211_CHAN_S1G_NO_PRIMARY, which states that the 1MHz channel cannot be used as a primary channel. This is assumed to be set by vendors as it is hardware and regdom specific, When we validate a 2MHz primary channel, we need to ensure both 1MHz subchannels do not contain this flag. If one or both of the 1MHz subchannels contain this flag then the 2MHz primary is not permitted for use as a primary channel. Properly integrate S1G channel validation such that it is implemented according with other PHY types such as VHT. Additionally, implement a new S1G-specific regulatory flag to allow cfg80211 to understand specific vendor requirements for S1G PHYs. Signed-off-by: Arien Judge Signed-off-by: Andrew Pope Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250918051913.500781-2-lachlan.hodges@morsemicro.com [remove redundant NL80211_ATTR_S1G_PRIMARY_2MHZ check] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 95 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 15 +++++++ net/wireless/chan.c | 103 +++++++++++++++++++++++++++++-------------- net/wireless/nl80211.c | 37 +++++++++------- net/wireless/reg.c | 76 ++++++++++--------------------- 5 files changed, 225 insertions(+), 101 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 26fd42e189ce..2d612c760dd1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -129,6 +129,13 @@ struct wiphy; * with very low power (VLP), even if otherwise set to NO_IR. * @IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY: Allow activity on a 20 MHz channel, * even if otherwise set to NO_IR. + * @IEEE80211_CHAN_S1G_NO_PRIMARY: Prevents the channel for use as an S1G + * primary channel. Does not prevent the wider operating channel + * described by the chandef from being used. In order for a 2MHz primary + * to be used, both 1MHz subchannels shall not contain this flag. + * @IEEE80211_CHAN_NO_4MHZ: 4 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_8MHZ: 8 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_16MHZ: 16 MHz bandwidth is not permitted on this channel. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = BIT(0), @@ -158,6 +165,10 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_CAN_MONITOR = BIT(24), IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP = BIT(25), IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY = BIT(26), + IEEE80211_CHAN_S1G_NO_PRIMARY = BIT(27), + IEEE80211_CHAN_NO_4MHZ = BIT(28), + IEEE80211_CHAN_NO_8MHZ = BIT(29), + IEEE80211_CHAN_NO_16MHZ = BIT(30), }; #define IEEE80211_CHAN_NO_HT40 \ @@ -821,6 +832,9 @@ struct key_params { * @punctured: mask of the punctured 20 MHz subchannels, with * bits turned on being disabled (punctured); numbered * from lower to higher frequency (like in the spec) + * @s1g_primary_2mhz: Indicates if the control channel pointed to + * by 'chan' exists as a 1MHz primary subchannel within an + * S1G 2MHz primary channel. */ struct cfg80211_chan_def { struct ieee80211_channel *chan; @@ -830,6 +844,7 @@ struct cfg80211_chan_def { struct ieee80211_edmg edmg; u16 freq1_offset; u16 punctured; + bool s1g_primary_2mhz; }; /* @@ -990,6 +1005,18 @@ cfg80211_chandef_is_edmg(const struct cfg80211_chan_def *chandef) return chandef->edmg.channels || chandef->edmg.bw_config; } +/** + * cfg80211_chandef_is_s1g - check if chandef represents an S1G channel + * @chandef: the channel definition + * + * Return: %true if S1G. + */ +static inline bool +cfg80211_chandef_is_s1g(const struct cfg80211_chan_def *chandef) +{ + return chandef->chan->band == NL80211_BAND_S1GHZ; +} + /** * cfg80211_chandef_compatible - check if two channel definitions are compatible * @chandef1: first channel definition @@ -10179,4 +10206,72 @@ ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file, void *data); #endif +/** + * cfg80211_s1g_get_start_freq_khz - get S1G chandef start frequency + * @chandef: the chandef to use + * + * Return: the chandefs starting frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_start_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz - bw_mhz * 500 + 500; +} + +/** + * cfg80211_s1g_get_end_freq_khz - get S1G chandef end frequency + * @chandef: the chandef to use + * + * Return: the chandefs ending frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_end_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz + bw_mhz * 500 - 500; +} + +/** + * cfg80211_s1g_get_primary_sibling - retrieve the sibling 1MHz subchannel + * for an S1G chandef using a 2MHz primary channel. + * @wiphy: wiphy the channel belongs to + * @chandef: the chandef to use + * + * When chandef::s1g_primary_2mhz is set to true, we are operating on a 2MHz + * primary channel. The 1MHz subchannel designated by the primary channel + * location exists within chandef::chan, whilst the 'sibling' is denoted as + * being the other 1MHz subchannel that make up the 2MHz primary channel. + * + * Returns: the sibling 1MHz &struct ieee80211_channel, or %NULL on failure. + */ +static inline struct ieee80211_channel * +cfg80211_s1g_get_primary_sibling(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + int width_mhz = cfg80211_chandef_get_width(chandef); + u32 pri_1mhz_khz, sibling_1mhz_khz, op_low_1mhz_khz, pri_index; + + if (!chandef->s1g_primary_2mhz || width_mhz < 2) + return NULL; + + pri_1mhz_khz = ieee80211_channel_to_khz(chandef->chan); + op_low_1mhz_khz = cfg80211_s1g_get_start_freq_khz(chandef); + + /* + * Compute the index of the primary 1 MHz subchannel within the + * operating channel, relative to the lowest 1 MHz center frequency. + * Flip the least significant bit to select the even/odd sibling, + * then translate that index back into a channel frequency. + */ + pri_index = (pri_1mhz_khz - op_low_1mhz_khz) / 1000; + sibling_1mhz_khz = op_low_1mhz_khz + ((pri_index ^ 1) * 1000); + + return ieee80211_get_channel_khz(wiphy, sibling_1mhz_khz); +} + #endif /* __NET_CFG80211_H */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 423e258cdbd2..8134f10e4e6c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2969,6 +2969,10 @@ enum nl80211_commands { * capabilities supported by the driver. See &enum nl80211_nan_capabilities * for details. * + * @NL80211_ATTR_S1G_PRIMARY_2MHZ: flag attribute indicating that the S1G + * primary channel is 2 MHz wide, and the control channel designates + * the 1 MHz primary subchannel within that 2 MHz primary. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3535,6 +3539,8 @@ enum nl80211_attrs { NL80211_ATTR_NAN_NEW_CLUSTER, NL80211_ATTR_NAN_CAPABILITIES, + NL80211_ATTR_S1G_PRIMARY_2MHZ, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4432,6 +4438,12 @@ enum nl80211_wmm_rule { * very low power (VLP) AP, despite being NO_IR. * @NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY: This channel can be active in * 20 MHz bandwidth, despite being NO_IR. + * @NL80211_FREQUENCY_ATTR_NO_4MHZ: 4 MHz operation is not allowed on this + * channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_8MHZ: 8 MHz operation is not allowed on this + * channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_16MHZ: 16 MHz operation is not allowed on this + * channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4477,6 +4489,9 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_CAN_MONITOR, NL80211_FREQUENCY_ATTR_ALLOW_6GHZ_VLP_AP, NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY, + NL80211_FREQUENCY_ATTR_NO_4MHZ, + NL80211_FREQUENCY_ATTR_NO_8MHZ, + NL80211_FREQUENCY_ATTR_NO_16MHZ, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 193734b7f9dc..68221b1ab45e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -100,6 +100,11 @@ static u32 cfg80211_get_end_freq(const struct cfg80211_chan_def *chandef, punctured = 0) : (punctured >>= 1))) \ if (!(punctured & 1)) +#define for_each_s1g_subchan(chandef, freq_khz) \ + for (freq_khz = cfg80211_s1g_get_start_freq_khz(chandef); \ + freq_khz <= cfg80211_s1g_get_end_freq_khz(chandef); \ + freq_khz += MHZ_TO_KHZ(1)) + struct cfg80211_per_bw_puncturing_values { u8 len; const u16 *valid_values; @@ -336,8 +341,7 @@ static bool cfg80211_valid_center_freq(u32 center, bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) { - u32 control_freq, oper_freq; - int oper_width, control_width; + u32 control_freq, control_freq_khz, start_khz, end_khz; if (!chandef->chan) return false; @@ -363,27 +367,16 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_4: case NL80211_CHAN_WIDTH_8: case NL80211_CHAN_WIDTH_16: - if (chandef->chan->band != NL80211_BAND_S1GHZ) - return false; - - control_freq = ieee80211_channel_to_khz(chandef->chan); - oper_freq = ieee80211_chandef_to_khz(chandef); - control_width = nl80211_chan_width_to_mhz( - ieee80211_s1g_channel_width( - chandef->chan)); - oper_width = cfg80211_chandef_get_width(chandef); - - if (oper_width < 0 || control_width < 0) + if (!cfg80211_chandef_is_s1g(chandef)) return false; if (chandef->center_freq2) return false; - if (control_freq + MHZ_TO_KHZ(control_width) / 2 > - oper_freq + MHZ_TO_KHZ(oper_width) / 2) - return false; + control_freq_khz = ieee80211_channel_to_khz(chandef->chan); + start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + end_khz = cfg80211_s1g_get_end_freq_khz(chandef); - if (control_freq - MHZ_TO_KHZ(control_width) / 2 < - oper_freq - MHZ_TO_KHZ(oper_width) / 2) + if (control_freq_khz < start_khz || control_freq_khz > end_khz) return false; break; case NL80211_CHAN_WIDTH_80P80: @@ -461,6 +454,9 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) !cfg80211_edmg_chandef_valid(chandef)) return false; + if (!cfg80211_chandef_is_s1g(chandef) && chandef->s1g_primary_2mhz) + return false; + return valid_puncturing_bitmap(chandef); } EXPORT_SYMBOL(cfg80211_chandef_valid); @@ -725,6 +721,10 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, { struct ieee80211_channel *c; + /* DFS is not required for S1G */ + if (cfg80211_chandef_is_s1g(chandef)) + return 0; + for_each_subchan(chandef, freq, cf) { c = ieee80211_get_channel_khz(wiphy, freq); if (!c) @@ -1130,6 +1130,55 @@ static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels, return true; } +static bool cfg80211_s1g_usable(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + u32 freq_khz; + const struct ieee80211_channel *chan; + u32 pri_khz = ieee80211_channel_to_khz(chandef->chan); + u32 end_khz = cfg80211_s1g_get_end_freq_khz(chandef); + u32 start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + int width_mhz = cfg80211_chandef_get_width(chandef); + u32 prohibited_flags = IEEE80211_CHAN_DISABLED; + + if (width_mhz >= 16) + prohibited_flags |= IEEE80211_CHAN_NO_16MHZ; + if (width_mhz >= 8) + prohibited_flags |= IEEE80211_CHAN_NO_8MHZ; + if (width_mhz >= 4) + prohibited_flags |= IEEE80211_CHAN_NO_4MHZ; + + if (chandef->chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + if (pri_khz < start_khz || pri_khz > end_khz) + return false; + + for_each_s1g_subchan(chandef, freq_khz) { + chan = ieee80211_get_channel_khz(wiphy, freq_khz); + if (!chan || (chan->flags & prohibited_flags)) + return false; + } + + if (chandef->s1g_primary_2mhz) { + u32 sib_khz; + const struct ieee80211_channel *sibling; + + sibling = cfg80211_s1g_get_primary_sibling(wiphy, chandef); + if (!sibling) + return false; + + if (sibling->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + sib_khz = ieee80211_channel_to_khz(sibling); + if (sib_khz < start_khz || sib_khz > end_khz) + return false; + } + + return true; +} + bool _cfg80211_chandef_usable(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, u32 prohibited_flags, @@ -1154,6 +1203,9 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, ext_nss_cap = __le16_to_cpu(vht_cap->vht_mcs.tx_highest) & IEEE80211_VHT_EXT_NSS_BW_CAPABLE; + if (cfg80211_chandef_is_s1g(chandef)) + return cfg80211_s1g_usable(wiphy, chandef); + if (edmg_cap->channels && !cfg80211_edmg_usable(wiphy, chandef->edmg.channels, @@ -1165,21 +1217,6 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, control_freq = chandef->chan->center_freq; switch (chandef->width) { - case NL80211_CHAN_WIDTH_1: - width = 1; - break; - case NL80211_CHAN_WIDTH_2: - width = 2; - break; - case NL80211_CHAN_WIDTH_4: - width = 4; - break; - case NL80211_CHAN_WIDTH_8: - width = 8; - break; - case NL80211_CHAN_WIDTH_16: - width = 16; - break; case NL80211_CHAN_WIDTH_5: width = 5; break; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4e0d40865441..de34a1d14073 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -931,6 +931,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_S1G_SHORT_BEACON] = NLA_POLICY_NESTED(nl80211_s1g_short_beacon), [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, + [NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -1319,6 +1320,15 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, nla_put_flag(msg, NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_4MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_4MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_8MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_8MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_16MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_16MHZ)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -3541,6 +3551,7 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, chandef->center_freq1 = KHZ_TO_MHZ(control_freq); chandef->freq1_offset = control_freq % 1000; chandef->center_freq2 = 0; + chandef->s1g_primary_2mhz = false; if (!chandef->chan) { NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ], @@ -3584,27 +3595,20 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, return -EINVAL; } } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { - chandef->width = - nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); - if (chandef->chan->band == NL80211_BAND_S1GHZ) { - /* User input error for channel width doesn't match channel */ - if (chandef->width != ieee80211_s1g_channel_width(chandef->chan)) { - NL_SET_ERR_MSG_ATTR(extack, - attrs[NL80211_ATTR_CHANNEL_WIDTH], - "bad channel width"); - return -EINVAL; - } - } + chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); - chandef->freq1_offset = - nla_get_u32_default(attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], - 0); + chandef->freq1_offset = nla_get_u32_default( + attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], 0); } + if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); + + chandef->s1g_primary_2mhz = nla_get_flag( + attrs[NL80211_ATTR_S1G_PRIMARY_2MHZ]); } if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) { @@ -10455,8 +10459,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto out_free; } - /* ignore disabled channels */ + /* Ignore disabled / no primary channels */ if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; @@ -10478,6 +10483,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) chan = &wiphy->bands[band]->channels[j]; if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & + IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 3b0ac3437f81..73cab51f6379 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1707,6 +1707,16 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + if (is_s1g) { + if (max_bandwidth_khz < MHZ_TO_KHZ(16)) + bw_flags |= IEEE80211_CHAN_NO_16MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(8)) + bw_flags |= IEEE80211_CHAN_NO_8MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(4)) + bw_flags |= IEEE80211_CHAN_NO_4MHZ; + return bw_flags; + } + /* If we get a reg_rule we can assume that at least 5Mhz fit */ if (!cfg80211_does_bw_fit_range(freq_range, center_freq_khz, @@ -1717,59 +1727,19 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (is_s1g) { - /* S1G is strict about non overlapping channels. We can - * calculate which bandwidth is allowed per channel by finding - * the largest bandwidth which cleanly divides the freq_range. - */ - int edge_offset; - int ch_bw = max_bandwidth_khz; - - while (ch_bw) { - edge_offset = (center_freq_khz - ch_bw / 2) - - freq_range->start_freq_khz; - if (edge_offset % ch_bw == 0) { - switch (KHZ_TO_MHZ(ch_bw)) { - case 1: - bw_flags |= IEEE80211_CHAN_1MHZ; - break; - case 2: - bw_flags |= IEEE80211_CHAN_2MHZ; - break; - case 4: - bw_flags |= IEEE80211_CHAN_4MHZ; - break; - case 8: - bw_flags |= IEEE80211_CHAN_8MHZ; - break; - case 16: - bw_flags |= IEEE80211_CHAN_16MHZ; - break; - default: - /* If we got here, no bandwidths fit on - * this frequency, ie. band edge. - */ - bw_flags |= IEEE80211_CHAN_DISABLED; - break; - } - break; - } - ch_bw /= 2; - } - } else { - if (max_bandwidth_khz < MHZ_TO_KHZ(10)) - bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(20)) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags |= IEEE80211_CHAN_NO_HT40; - if (max_bandwidth_khz < MHZ_TO_KHZ(80)) - bw_flags |= IEEE80211_CHAN_NO_80MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(160)) - bw_flags |= IEEE80211_CHAN_NO_160MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(320)) - bw_flags |= IEEE80211_CHAN_NO_320MHZ; - } + if (max_bandwidth_khz < MHZ_TO_KHZ(10)) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(20)) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(40)) + bw_flags |= IEEE80211_CHAN_NO_HT40; + if (max_bandwidth_khz < MHZ_TO_KHZ(80)) + bw_flags |= IEEE80211_CHAN_NO_80MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(160)) + bw_flags |= IEEE80211_CHAN_NO_160MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(320)) + bw_flags |= IEEE80211_CHAN_NO_320MHZ; + return bw_flags; } -- cgit v1.2.3 From cbcd507f01deb983d5cad0a25b6495930ab59593 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Thu, 18 Sep 2025 15:19:13 +1000 Subject: wifi: cfg80211: remove ieee80211_s1g_channel_width With the introduction of proper S1G channel flags, this function is no longer used. Remove it. Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250918051913.500781-4-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 ---------- net/wireless/util.c | 27 --------------------------- 2 files changed, 37 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2d612c760dd1..1c041ce7a03b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6818,16 +6818,6 @@ ieee80211_channel_to_khz(const struct ieee80211_channel *chan) return MHZ_TO_KHZ(chan->center_freq) + chan->freq_offset; } -/** - * ieee80211_s1g_channel_width - get allowed channel width from @chan - * - * Only allowed for band NL80211_BAND_S1GHZ - * @chan: channel - * Return: The allowed channel width for this center_freq - */ -enum nl80211_chan_width -ieee80211_s1g_channel_width(const struct ieee80211_channel *chan); - /** * ieee80211_channel_to_freq_khz - convert channel number to frequency * @chan: channel number diff --git a/net/wireless/util.c b/net/wireless/util.c index d12d49134c88..f26440d18ad3 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -106,33 +106,6 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) } EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); -enum nl80211_chan_width -ieee80211_s1g_channel_width(const struct ieee80211_channel *chan) -{ - if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ)) - return NL80211_CHAN_WIDTH_20_NOHT; - - /*S1G defines a single allowed channel width per channel. - * Extract that width here. - */ - if (chan->flags & IEEE80211_CHAN_1MHZ) - return NL80211_CHAN_WIDTH_1; - else if (chan->flags & IEEE80211_CHAN_2MHZ) - return NL80211_CHAN_WIDTH_2; - else if (chan->flags & IEEE80211_CHAN_4MHZ) - return NL80211_CHAN_WIDTH_4; - else if (chan->flags & IEEE80211_CHAN_8MHZ) - return NL80211_CHAN_WIDTH_8; - else if (chan->flags & IEEE80211_CHAN_16MHZ) - return NL80211_CHAN_WIDTH_16; - - pr_err("unknown channel width for channel at %dKHz?\n", - ieee80211_channel_to_khz(chan)); - - return NL80211_CHAN_WIDTH_1; -} -EXPORT_SYMBOL(ieee80211_s1g_channel_width); - int ieee80211_freq_khz_to_channel(u32 freq) { /* TODO: just handle MHz for now */ -- cgit v1.2.3 From d7afdf889561058068ab46fd8f306c70ef29216a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:49 +0200 Subject: ns: add to__ns() to respective headers Every namespace type has a container_of(ns, , ns) static inline function that is currently not exposed in the header. So we have a bunch of places that open-code it via container_of(). Move it to the headers so we can use it directly. Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- include/linux/cgroup.h | 5 +++++ include/linux/ipc_namespace.h | 5 +++++ include/linux/pid_namespace.h | 5 +++++ include/linux/time_namespace.h | 4 ++++ include/linux/user_namespace.h | 5 +++++ include/linux/utsname.h | 5 +++++ include/net/net_namespace.h | 5 +++++ ipc/namespace.c | 5 ----- kernel/cgroup/namespace.c | 5 ----- kernel/pid_namespace.c | 5 ----- kernel/time/namespace.c | 5 ----- kernel/user_namespace.c | 5 ----- kernel/utsname.c | 5 ----- net/core/net_namespace.c | 5 ----- 14 files changed, 34 insertions(+), 35 deletions(-) (limited to 'include/net') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..9ca25346f7cb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -794,6 +794,11 @@ extern struct cgroup_namespace init_cgroup_ns; #ifdef CONFIG_CGROUPS +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + void free_cgroup_ns(struct cgroup_namespace *ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e8240cf2611a..924e4754374f 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,6 +129,11 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) +static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) +{ + return container_of(ns, struct ipc_namespace, ns); +} + extern struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a5811199..ba0efc8c8596 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -54,6 +54,11 @@ extern struct pid_namespace init_pid_ns; #define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS +static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) +{ + return container_of(ns, struct pid_namespace, ns); +} + static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 7f6af7a9771e..a47a4ce4183e 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -33,6 +33,10 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +static inline struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} void __init time_ns_init(void); extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index a0bb6d012137..a09056ad090e 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -168,6 +168,11 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, #ifdef CONFIG_USER_NS +static inline struct user_namespace *to_user_ns(struct ns_common *ns) +{ + return container_of(ns, struct user_namespace, ns); +} + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) diff --git a/include/linux/utsname.h b/include/linux/utsname.h index bf7613ba412b..5d34c4f0f945 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -30,6 +30,11 @@ struct uts_namespace { extern struct uts_namespace init_uts_ns; #ifdef CONFIG_UTS_NS +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + static inline void get_uts_ns(struct uts_namespace *ns) { refcount_inc(&ns->ns.count); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 025a7574b275..fd090ceb80bf 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -262,6 +262,11 @@ void ipx_unregister_sysctl(void); #ifdef CONFIG_NET_NS void __put_net(struct net *net); +static inline struct net *to_net_ns(struct ns_common *ns) +{ + return container_of(ns, struct net, ns); +} + /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { diff --git a/ipc/namespace.c b/ipc/namespace.c index 9f923c1a1eb3..89588819956b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -209,11 +209,6 @@ void put_ipc_ns(struct ipc_namespace *ns) } } -static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) -{ - return container_of(ns, struct ipc_namespace, ns); -} - static struct ns_common *ipcns_get(struct task_struct *task) { struct ipc_namespace *ns = NULL; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index fc12c416dfeb..5a327914b565 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -89,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, return new_ns; } -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 228ae20299f9..9b327420309e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -345,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) -{ - return container_of(ns, struct pid_namespace, ns); -} - static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 408f60d0a3b6..20b65f90549e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -261,11 +261,6 @@ void free_time_ns(struct time_namespace *ns) kfree_rcu(ns, ns.ns_rcu); } -static struct time_namespace *to_time_ns(struct ns_common *ns) -{ - return container_of(ns, struct time_namespace, ns); -} - static struct ns_common *timens_get(struct task_struct *task) { struct time_namespace *ns = NULL; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ade5b6806c5c..cfb0e28f2779 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1325,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns) } EXPORT_SYMBOL(current_in_userns); -static inline struct user_namespace *to_user_ns(struct ns_common *ns) -{ - return container_of(ns, struct user_namespace, ns); -} - static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; diff --git a/kernel/utsname.c b/kernel/utsname.c index 64155417ae0c..a682830742d3 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -103,11 +103,6 @@ void free_uts_ns(struct uts_namespace *ns) kfree_rcu(ns, ns.ns_rcu); } -static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) -{ - return container_of(ns, struct uts_namespace, ns); -} - static struct ns_common *utsns_get(struct task_struct *task) { struct uts_namespace *ns = NULL; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 169ec22c4758..a57b3cda8dbc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1541,11 +1541,6 @@ static struct ns_common *netns_get(struct task_struct *task) return net ? &net->ns : NULL; } -static inline struct net *to_net_ns(struct ns_common *ns) -{ - return container_of(ns, struct net, ns); -} - static void netns_put(struct ns_common *ns) { put_net(to_net_ns(ns)); -- cgit v1.2.3 From 99d33ce100cbc982647c9299cadb1277cfad503e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:57 +0200 Subject: net: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/net/net_namespace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fd090ceb80bf..3e7c825e5810 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -270,7 +270,7 @@ static inline struct net *to_net_ns(struct ns_common *ns) /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { - refcount_inc(&net->ns.count); + ns_ref_inc(net); return net; } @@ -281,7 +281,7 @@ static inline struct net *maybe_get_net(struct net *net) * exists. If the reference count is zero this * function fails and returns NULL. */ - if (!refcount_inc_not_zero(&net->ns.count)) + if (!ns_ref_get(net)) net = NULL; return net; } @@ -289,7 +289,7 @@ static inline struct net *maybe_get_net(struct net *net) /* Try using put_net_track() instead */ static inline void put_net(struct net *net) { - if (refcount_dec_and_test(&net->ns.count)) + if (ns_ref_put(net)) __put_net(net); } @@ -301,7 +301,7 @@ int net_eq(const struct net *net1, const struct net *net2) static inline int check_net(const struct net *net) { - return refcount_read(&net->ns.count) != 0; + return ns_ref_read(net) != 0; } void net_drop_ns(void *); -- cgit v1.2.3 From f8d2f8205be8cceef2dd3c0e68e7af3c5f83c75c Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Thu, 18 Sep 2025 08:52:02 -0700 Subject: psp: make struct sock argument const in psp_sk_get_assoc_rcu() This function does not need a mutable reference to its argument. Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20250918155205.2197603-2-daniel.zahka@gmail.com Signed-off-by: Jakub Kicinski --- include/net/psp/functions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 91ba06733321..fb3cbe8427ea 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -124,7 +124,7 @@ psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) return __psp_sk_rx_policy_check(skb, rcu_dereference(tw->psp_assoc)); } -static inline struct psp_assoc *psp_sk_get_assoc_rcu(struct sock *sk) +static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk) { struct inet_timewait_sock *tw; struct psp_assoc *pas; -- cgit v1.2.3 From 803cdb6ddca3e24418226e17e4b1c1134619aca8 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Thu, 18 Sep 2025 08:52:03 -0700 Subject: psp: fix preemptive inet_twsk() cast in psp_sk_get_assoc_rcu() It is weird to cast to a timewait_sock before checking sk_state, even if the use is after such a check. Remove the tw local variable, and use inet_twsk() directly in the timewait branch. Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20250918155205.2197603-3-daniel.zahka@gmail.com Signed-off-by: Jakub Kicinski --- include/net/psp/functions.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index fb3cbe8427ea..980de7e58f8a 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -126,7 +126,6 @@ psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk) { - struct inet_timewait_sock *tw; struct psp_assoc *pas; int state; @@ -134,9 +133,9 @@ static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk) if (!sk_is_inet(sk) || state & TCPF_NEW_SYN_RECV) return NULL; - tw = inet_twsk(sk); - pas = state & TCPF_TIME_WAIT ? rcu_dereference(tw->psp_assoc) : - rcu_dereference(sk->psp_assoc); + pas = state & TCPF_TIME_WAIT ? + rcu_dereference(inet_twsk(sk)->psp_assoc) : + rcu_dereference(sk->psp_assoc); return pas; } -- cgit v1.2.3 From 28bb24dadd0ed70aed43cf9af3a54c22c3ce04b2 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Thu, 18 Sep 2025 08:52:04 -0700 Subject: psp: don't use flags for checking sk_state Using flags to check sk_state only makes sense to check for a subset of states in parallel e.g. sk_fullsock(). We are not doing that here. Compare for individual states directly. Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20250918155205.2197603-4-daniel.zahka@gmail.com Signed-off-by: Jakub Kicinski --- include/net/psp/functions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 980de7e58f8a..ef7743664da3 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -129,11 +129,11 @@ static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk) struct psp_assoc *pas; int state; - state = 1 << READ_ONCE(sk->sk_state); - if (!sk_is_inet(sk) || state & TCPF_NEW_SYN_RECV) + state = READ_ONCE(sk->sk_state); + if (!sk_is_inet(sk) || state == TCP_NEW_SYN_RECV) return NULL; - pas = state & TCPF_TIME_WAIT ? + pas = state == TCP_TIME_WAIT ? rcu_dereference(inet_twsk(sk)->psp_assoc) : rcu_dereference(sk->psp_assoc); return pas; -- cgit v1.2.3 From f1bf77491d5e48ab5477f585ee5fca2aa524bd15 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 18 Sep 2025 19:25:35 +0000 Subject: psp: Fix typo in kdoc for struct psp_dev_caps.assoc_drv_spc. assoc_drv_spc is the size of psp_assoc.drv_data[]. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250918192539.1587586-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/psp/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/psp/types.h b/include/net/psp/types.h index d9688e66cf09..31cee64b7c86 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -98,7 +98,7 @@ struct psp_dev_caps { /** * @assoc_drv_spc: size of driver-specific state in Tx assoc - * Determines the size of struct psp_assoc::drv_spc + * Determines the size of struct psp_assoc::drv_data */ u32 assoc_drv_spc; }; -- cgit v1.2.3 From 1c7e4a618509476658bafba35fffb3a5cfb213b1 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 15 Sep 2025 11:19:54 +0200 Subject: net: ipv4: make udp_v4_early_demux explicitly return drop reason udp_v4_early_demux already returns drop reasons as it either returns 0 or ip_mc_validate_source, which itself returns drop reasons. Its return value is also already used as a drop reason itself. Makes this explicit by making it return drop reasons. Signed-off-by: Antoine Tenart Reviewed-by: David Ahern Link: https://patch.msgid.link/20250915091958.15382-2-atenart@kernel.org Signed-off-by: Jakub Kicinski --- include/net/udp.h | 2 +- net/ipv4/ip_input.c | 2 +- net/ipv4/udp.c | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index eecd64097f91..059a0cee5f55 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -404,7 +404,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, return __skb_recv_udp(sk, flags, &off, err); } -int udp_v4_early_demux(struct sk_buff *skb); +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index a09aca2c8567..8878e865ddf6 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -319,7 +319,7 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, } int tcp_v4_early_demux(struct sk_buff *skb); -int udp_v4_early_demux(struct sk_buff *skb); +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0c40426628eb..85cfc32eb2cc 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2811,7 +2811,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, return NULL; } -int udp_v4_early_demux(struct sk_buff *skb) +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct in_device *in_dev = NULL; @@ -2825,7 +2825,7 @@ int udp_v4_early_demux(struct sk_buff *skb) /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return 0; + return SKB_NOT_DROPPED_YET; iph = ip_hdr(skb); uh = udp_hdr(skb); @@ -2834,12 +2834,12 @@ int udp_v4_early_demux(struct sk_buff *skb) in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) - return 0; + return SKB_NOT_DROPPED_YET; ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) - return 0; + return SKB_NOT_DROPPED_YET; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, @@ -2850,7 +2850,7 @@ int udp_v4_early_demux(struct sk_buff *skb) } if (!sk) - return 0; + return SKB_NOT_DROPPED_YET; skb->sk = sk; DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); @@ -2877,7 +2877,7 @@ int udp_v4_early_demux(struct sk_buff *skb) ip4h_dscp(iph), skb->dev, in_dev, &itag); } - return 0; + return SKB_NOT_DROPPED_YET; } int udp_rcv(struct sk_buff *skb) -- cgit v1.2.3 From 9e622804d57e2d08f0271200606bd1270f75126f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Aug 2025 11:10:20 -0400 Subject: Bluetooth: hci_event: Fix UAF in hci_acl_create_conn_sync This fixes the following UFA in hci_acl_create_conn_sync where a connection still pending is command submission (conn->state == BT_OPEN) maybe freed, also since this also can happen with the likes of hci_le_create_conn_sync fix it as well: BUG: KASAN: slab-use-after-free in hci_acl_create_conn_sync+0x5ef/0x790 net/bluetooth/hci_sync.c:6861 Write of size 2 at addr ffff88805ffcc038 by task kworker/u11:2/9541 CPU: 1 UID: 0 PID: 9541 Comm: kworker/u11:2 Not tainted 6.16.0-rc7 #3 PREEMPT(full) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Workqueue: hci3 hci_cmd_sync_work Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x230 mm/kasan/report.c:480 kasan_report+0x118/0x150 mm/kasan/report.c:593 hci_acl_create_conn_sync+0x5ef/0x790 net/bluetooth/hci_sync.c:6861 hci_cmd_sync_work+0x210/0x3a0 net/bluetooth/hci_sync.c:332 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Allocated by task 123736: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4359 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1039 [inline] __hci_conn_add+0x233/0x1b30 net/bluetooth/hci_conn.c:939 hci_conn_add_unset net/bluetooth/hci_conn.c:1051 [inline] hci_connect_acl+0x16c/0x4e0 net/bluetooth/hci_conn.c:1634 pair_device+0x418/0xa70 net/bluetooth/mgmt.c:3556 hci_mgmt_cmd+0x9c9/0xef0 net/bluetooth/hci_sock.c:1719 hci_sock_sendmsg+0x6ca/0xef0 net/bluetooth/hci_sock.c:1839 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg+0x219/0x270 net/socket.c:727 sock_write_iter+0x258/0x330 net/socket.c:1131 new_sync_write fs/read_write.c:593 [inline] vfs_write+0x54b/0xa90 fs/read_write.c:686 ksys_write+0x145/0x250 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 103680: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2381 [inline] slab_free mm/slub.c:4643 [inline] kfree+0x18e/0x440 mm/slub.c:4842 device_release+0x9c/0x1c0 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x22b/0x480 lib/kobject.c:737 hci_conn_cleanup net/bluetooth/hci_conn.c:175 [inline] hci_conn_del+0x8ff/0xcb0 net/bluetooth/hci_conn.c:1173 hci_conn_complete_evt+0x3c7/0x1040 net/bluetooth/hci_event.c:3199 hci_event_func net/bluetooth/hci_event.c:7477 [inline] hci_event_packet+0x7e0/0x1200 net/bluetooth/hci_event.c:7531 hci_rx_work+0x46a/0xe80 net/bluetooth/hci_core.c:4070 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Last potentially related work creation: kasan_save_stack+0x3e/0x60 mm/kasan/common.c:47 kasan_record_aux_stack+0xbd/0xd0 mm/kasan/generic.c:548 insert_work+0x3d/0x330 kernel/workqueue.c:2183 __queue_work+0xbd9/0xfe0 kernel/workqueue.c:2345 queue_delayed_work_on+0x18b/0x280 kernel/workqueue.c:2561 pairing_complete+0x1e7/0x2b0 net/bluetooth/mgmt.c:3451 pairing_complete_cb+0x1ac/0x230 net/bluetooth/mgmt.c:3487 hci_connect_cfm include/net/bluetooth/hci_core.h:2064 [inline] hci_conn_failed+0x24d/0x310 net/bluetooth/hci_conn.c:1275 hci_conn_complete_evt+0x3c7/0x1040 net/bluetooth/hci_event.c:3199 hci_event_func net/bluetooth/hci_event.c:7477 [inline] hci_event_packet+0x7e0/0x1200 net/bluetooth/hci_event.c:7531 hci_rx_work+0x46a/0xe80 net/bluetooth/hci_core.c:4070 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Fixes: aef2aa4fa98e ("Bluetooth: hci_event: Fix creating hci_conn object on error status") Reported-by: Junvyyang, Tencent Zhuque Lab Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 21 +++++++++++++++++++++ net/bluetooth/hci_event.c | 26 +++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6906af7a8f24..6560b32f3125 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1245,6 +1245,27 @@ static inline struct hci_conn *hci_conn_hash_lookup_ba(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_conn_hash_lookup_role(struct hci_dev *hdev, + __u8 type, __u8 role, + bdaddr_t *ba) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == type && c->role == role && !bacmp(&c->dst, ba)) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev, bdaddr_t *ba, __u8 ba_type) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 97f543824bb0..fe49e8a7969f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3087,8 +3087,18 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + /* Check for existing connection: + * + * 1. If it doesn't exist then it must be receiver/slave role. + * 2. If it does exist confirm that it is connecting/BT_CONNECT in case + * of initiator/master role since there could be a collision where + * either side is attempting to connect or something like a fuzzing + * testing is trying to play tricks to destroy the hcon object before + * it even attempts to connect (e.g. hcon->state == BT_OPEN). + */ conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); - if (!conn) { + if (!conn || + (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup. */ @@ -5638,8 +5648,18 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, */ hci_dev_clear_flag(hdev, HCI_LE_ADV); - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, bdaddr); - if (!conn) { + /* Check for existing connection: + * + * 1. If it doesn't exist then use the role to create a new object. + * 2. If it does exist confirm that it is connecting/BT_CONNECT in case + * of initiator/master role since there could be a collision where + * either side is attempting to connect or something like a fuzzing + * testing is trying to play tricks to destroy the hcon object before + * it even attempts to connect (e.g. hcon->state == BT_OPEN). + */ + conn = hci_conn_hash_lookup_role(hdev, LE_LINK, role, bdaddr); + if (!conn || + (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup. */ -- cgit v1.2.3 From e0d3bba84ff8b82d4e8820856a7850afb17c14f9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 19 Sep 2025 12:23:25 +0200 Subject: wifi: cfg80211: remove IEEE80211_CHAN_{1,2,4,8,16}MHZ flags These were used by S1G for older chandef representation, but are no longer needed. Clean them up, even if we can't drop them from the userspace API entirely. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 16 +--------------- net/wireless/nl80211.c | 15 --------------- 2 files changed, 1 insertion(+), 30 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1c041ce7a03b..781624f5913a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -101,16 +101,6 @@ struct wiphy; * @IEEE80211_CHAN_NO_10MHZ: 10 MHz bandwidth is not permitted * on this channel. * @IEEE80211_CHAN_NO_HE: HE operation is not permitted on this channel. - * @IEEE80211_CHAN_1MHZ: 1 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_2MHZ: 2 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_4MHZ: 4 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_8MHZ: 8 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted - * on this channel. * @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band, * this flag indicates that a 320 MHz channel cannot use this * channel as the control or any of the secondary channels. @@ -152,11 +142,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_NO_20MHZ = BIT(11), IEEE80211_CHAN_NO_10MHZ = BIT(12), IEEE80211_CHAN_NO_HE = BIT(13), - IEEE80211_CHAN_1MHZ = BIT(14), - IEEE80211_CHAN_2MHZ = BIT(15), - IEEE80211_CHAN_4MHZ = BIT(16), - IEEE80211_CHAN_8MHZ = BIT(17), - IEEE80211_CHAN_16MHZ = BIT(18), + /* can use free bits here */ IEEE80211_CHAN_NO_320MHZ = BIT(19), IEEE80211_CHAN_NO_EHT = BIT(20), IEEE80211_CHAN_DFS_CONCURRENT = BIT(21), diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index de34a1d14073..346dfd2bd987 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1280,21 +1280,6 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_HE) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HE)) goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_1MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_1MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_2MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_2MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_4MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_4MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_8MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_8MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_16MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ)) - goto nla_put_failure; if ((chan->flags & IEEE80211_CHAN_NO_320MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_320MHZ)) goto nla_put_failure; -- cgit v1.2.3 From 6445bb832dc0ba0ab816e5bd79ef0209cdd46d3a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 19 Sep 2025 08:35:28 +0000 Subject: tcp: Remove osk from __inet_hash() arg. __inet_hash() is called from inet_hash() and inet6_hash with osk NULL. Let's remove the 2nd arg from __inet_hash(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250919083706.1863217-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 2 +- net/ipv4/inet_hashtables.c | 6 +++--- net/ipv6/inet6_hashtables.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index a3b32241c2f2..64bc8870db88 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -289,7 +289,7 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); -int __inet_hash(struct sock *sk, struct sock *osk); +int __inet_hash(struct sock *sk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ef4ccfd46ff6..baee5c075e6c 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -739,7 +739,7 @@ static int inet_reuseport_add_sock(struct sock *sk, return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } -int __inet_hash(struct sock *sk, struct sock *osk) +int __inet_hash(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; @@ -747,7 +747,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) if (sk->sk_state != TCP_LISTEN) { local_bh_disable(); - inet_ehash_nolisten(sk, osk, NULL); + inet_ehash_nolisten(sk, NULL, NULL); local_bh_enable(); return 0; } @@ -779,7 +779,7 @@ int inet_hash(struct sock *sk) int err = 0; if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk, NULL); + err = __inet_hash(sk); return err; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index a3a9ea49fee2..64fcd7df0c9a 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -374,7 +374,7 @@ int inet6_hash(struct sock *sk) int err = 0; if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk, NULL); + err = __inet_hash(sk); return err; } -- cgit v1.2.3 From 0ac44301e3bf4f5abc892ab530188ca95c61e59f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 19 Sep 2025 08:35:29 +0000 Subject: tcp: Remove inet6_hash(). inet_hash() and inet6_hash() are exactly the same. Also, we do not need to export inet6_hash(). Let's consolidate the two into __inet_hash() and rename it to inet_hash(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250919083706.1863217-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/inet6_hashtables.h | 2 -- include/net/inet_hashtables.h | 1 - net/ipv4/inet_hashtables.c | 17 +++++------------ net/ipv6/inet6_hashtables.c | 11 ----------- net/ipv6/tcp_ipv6.c | 2 +- 5 files changed, 6 insertions(+), 27 deletions(-) (limited to 'include/net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 1f985d2012ce..282e29237d93 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -167,8 +167,6 @@ struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *daddr, const __be16 dport, const int dif); -int inet6_hash(struct sock *sk); - static inline bool inet6_match(const struct net *net, const struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 64bc8870db88..b787be651ce7 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -289,7 +289,6 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); -int __inet_hash(struct sock *sk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index baee5c075e6c..efa8a615b868 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -739,12 +739,15 @@ static int inet_reuseport_add_sock(struct sock *sk, return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } -int __inet_hash(struct sock *sk) +int inet_hash(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; + if (sk->sk_state == TCP_CLOSE) + return 0; + if (sk->sk_state != TCP_LISTEN) { local_bh_disable(); inet_ehash_nolisten(sk, NULL, NULL); @@ -772,17 +775,7 @@ unlock: return err; } -EXPORT_IPV6_MOD(__inet_hash); - -int inet_hash(struct sock *sk) -{ - int err = 0; - - if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk); - - return err; -} +EXPORT_IPV6_MOD(inet_hash); void inet_unhash(struct sock *sk) { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 64fcd7df0c9a..5e1da088d8e1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -368,14 +368,3 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); - -int inet6_hash(struct sock *sk) -{ - int err = 0; - - if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk); - - return err; -} -EXPORT_SYMBOL_GPL(inet6_hash); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d1e5b2a186fb..9622c2776ade 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2355,7 +2355,7 @@ struct proto tcpv6_prot = { .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .hash = inet6_hash, + .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, -- cgit v1.2.3 From 17b14d235f58155a05cd9371e4559361ca3c67da Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:49 +0000 Subject: net: move sk_uid and sk_protocol to sock_read_tx sk_uid and sk_protocol are read from inet6_csk_route_socket() for each TCP transmit. Also read from udpv6_sendmsg(), udp_sendmsg() and others. Move them to sock_read_tx for better cache locality. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 6 +++--- net/core/sock.c | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index ee95081b0c0b..66c2f396b57d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -492,6 +492,9 @@ struct sock { long sk_sndtimeo; u32 sk_priority; u32 sk_mark; + kuid_t sk_uid; + u16 sk_protocol; + u16 sk_type; struct dst_entry __rcu *sk_dst_cache; netdev_features_t sk_route_caps; #ifdef CONFIG_SOCK_VALIDATE_XMIT @@ -517,15 +520,12 @@ struct sock { sk_no_check_tx : 1, sk_no_check_rx : 1; u8 sk_shutdown; - u16 sk_type; - u16 sk_protocol; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; - kuid_t sk_uid; unsigned long sk_ino; spinlock_t sk_peer_lock; int sk_bind_phc; diff --git a/net/core/sock.c b/net/core/sock.c index 21742da19e45..ad79efde4476 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4471,6 +4471,8 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); -- cgit v1.2.3 From 9303c3ced111803dcd1aa36a778f290977935ca5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:50 +0000 Subject: net: move sk->sk_err_soft and sk->sk_sndbuf sk->sk_sndbuf is read-mostly in tx path, so move it from sock_write_tx group to more appropriate sock_read_tx. sk->sk_err_soft was not identified previously, but is used from tcp_ack(). Move it to sock_write_tx group for better cache locality. Also change tcp_ack() to clear sk->sk_err_soft only if needed. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 ++-- net/core/sock.c | 3 ++- net/ipv4/tcp_input.c | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 66c2f396b57d..b4fefeea0213 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -467,7 +467,7 @@ struct sock { __cacheline_group_begin(sock_write_tx); int sk_write_pending; atomic_t sk_omem_alloc; - int sk_sndbuf; + int sk_err_soft; int sk_wmem_queued; refcount_t sk_wmem_alloc; @@ -507,6 +507,7 @@ struct sock { unsigned int sk_gso_max_size; gfp_t sk_allocation; u32 sk_txhash; + int sk_sndbuf; u8 sk_pacing_shift; bool sk_use_task_frag; __cacheline_group_end(sock_read_tx); @@ -523,7 +524,6 @@ struct sock { unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; - int sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; unsigned long sk_ino; diff --git a/net/core/sock.c b/net/core/sock.c index ad79efde4476..dc03d4b5909a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4452,7 +4452,7 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); @@ -4479,6 +4479,7 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9fdc6ce25eb1..f93d48d98d5d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4085,7 +4085,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* We passed data and got it acked, remove any soft error * log. Something worked... */ - WRITE_ONCE(sk->sk_err_soft, 0); + if (READ_ONCE(sk->sk_err_soft)) + WRITE_ONCE(sk->sk_err_soft, 0); WRITE_ONCE(icsk->icsk_probes_out, 0); tp->rcv_tstamp = tcp_jiffies32; if (!prior_packets) -- cgit v1.2.3 From 649091ef597bb7de34dd8ceea39bbc4252970558 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:56 +0000 Subject: tcp: reclaim 8 bytes in struct request_sock_queue synflood_warned had to be u32 for xchg(), but ensuring atomicity is not really needed. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/request_sock.h | 2 +- net/ipv4/tcp_input.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 6a5ec1418e85..cd4d4cf71d0d 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -185,8 +185,8 @@ struct fastopen_queue { struct request_sock_queue { spinlock_t rskq_lock; u8 rskq_defer_accept; + u8 synflood_warned; - u32 synflood_warned; atomic_t qlen; atomic_t young; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f93d48d98d5d..79d5252ed6cc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7282,8 +7282,8 @@ static bool tcp_syn_flood_action(struct sock *sk, const char *proto) #endif __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 && - xchg(&queue->synflood_warned, 1) == 0) { + if (syncookies != 2 && !READ_ONCE(queue->synflood_warned)) { + WRITE_ONCE(queue->synflood_warned, 1); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) { net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n", proto, inet6_rcv_saddr(sk), -- cgit v1.2.3 From d57f4b874946e997be52f5ebb5e0e1dad368c16f Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 17 Sep 2025 15:22:04 +0200 Subject: tcp: Update bind bucket state on port release Today, once an inet_bind_bucket enters a state where fastreuse >= 0 or fastreuseport >= 0 after a socket is explicitly bound to a port, it remains in that state until all sockets are removed and the bucket is destroyed. In this state, the bucket is skipped during ephemeral port selection in connect(). For applications using a reduced ephemeral port range (IP_LOCAL_PORT_RANGE socket option), this can cause faster port exhaustion since blocked buckets are excluded from reuse. The reason the bucket state isn't updated on port release is unclear. Possibly a performance trade-off to avoid scanning bucket owners, or just an oversight. Fix it by recalculating the bucket state when a socket releases a port. To limit overhead, each inet_bind2_bucket stores its own (fastreuse, fastreuseport) state. On port release, only the relevant port-addr bucket is scanned, and the overall state is derived from these. Signed-off-by: Jakub Sitnicki Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250917-update-bind-bucket-state-on-unhash-v5-1-57168b661b47@cloudflare.com Signed-off-by: Paolo Abeni --- include/net/inet_connection_sock.h | 5 +++-- include/net/inet_hashtables.h | 2 ++ include/net/inet_timewait_sock.h | 3 ++- include/net/sock.h | 4 ++++ net/ipv4/inet_connection_sock.c | 12 +++++++---- net/ipv4/inet_hashtables.c | 44 +++++++++++++++++++++++++++++++++++++- net/ipv4/inet_timewait_sock.c | 1 + 7 files changed, 63 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 0737d8e178dd..b4b886647607 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -316,8 +316,9 @@ int inet_csk_listen_start(struct sock *sk); void inet_csk_listen_stop(struct sock *sk); /* update the fast reuse flag when adding a socket */ -void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, - struct sock *sk); +void inet_csk_update_fastreuse(const struct sock *sk, + struct inet_bind_bucket *tb, + struct inet_bind2_bucket *tb2); struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index b787be651ce7..ac05a52d9e13 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -108,6 +108,8 @@ struct inet_bind2_bucket { struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; + signed char fastreuse; + signed char fastreuseport; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 3a31c74c9e15..63a644ff30de 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -70,7 +70,8 @@ struct inet_timewait_sock { unsigned int tw_transparent : 1, tw_flowlabel : 20, tw_usec_ts : 1, - tw_pad : 2, /* 2 bits hole */ + tw_connect_bind : 1, + tw_pad : 1, /* 1 bit hole */ tw_tos : 8; u32 tw_txhash; u32 tw_priority; diff --git a/include/net/sock.h b/include/net/sock.h index b4fefeea0213..8c5b64f41ab7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1494,6 +1494,10 @@ static inline int __sk_prot_rehash(struct sock *sk) #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 +/** + * define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time + */ +#define SOCK_CONNECT_BIND 16 struct socket_alloc { struct socket socket; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 142ff8d86fc2..cdd1e12aac8c 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -423,7 +423,7 @@ success: } static inline int sk_reuseport_match(struct inet_bind_bucket *tb, - struct sock *sk) + const struct sock *sk) { if (tb->fastreuseport <= 0) return 0; @@ -453,8 +453,9 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, ipv6_only_sock(sk), true, false); } -void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, - struct sock *sk) +void inet_csk_update_fastreuse(const struct sock *sk, + struct inet_bind_bucket *tb, + struct inet_bind2_bucket *tb2) { bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; @@ -501,6 +502,9 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, tb->fastreuseport = 0; } } + + tb2->fastreuse = tb->fastreuse; + tb2->fastreuseport = tb->fastreuseport; } /* Obtain a reference to a local port for the given sock, @@ -582,7 +586,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) } success: - inet_csk_update_fastreuse(tb, sk); + inet_csk_update_fastreuse(sk, tb, tb2); if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, tb2, port); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4eb933f56fe6..b7024e3d9ac3 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -58,6 +58,14 @@ static u32 sk_ehashfn(const struct sock *sk) sk->sk_daddr, sk->sk_dport); } +static bool sk_is_connect_bind(const struct sock *sk) +{ + if (sk->sk_state == TCP_TIME_WAIT) + return inet_twsk(sk)->tw_connect_bind; + else + return sk->sk_userlocks & SOCK_CONNECT_BIND; +} + /* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. @@ -87,10 +95,22 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, */ void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) { + const struct inet_bind2_bucket *tb2; + if (hlist_empty(&tb->bhash2)) { hlist_del_rcu(&tb->node); kfree_rcu(tb, rcu); + return; + } + + if (tb->fastreuse == -1 && tb->fastreuseport == -1) + return; + hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) { + if (tb2->fastreuse != -1 || tb2->fastreuseport != -1) + return; } + tb->fastreuse = -1; + tb->fastreuseport = -1; } bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, @@ -121,6 +141,8 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, #else tb2->rcv_saddr = sk->sk_rcv_saddr; #endif + tb2->fastreuse = 0; + tb2->fastreuseport = 0; INIT_HLIST_HEAD(&tb2->owners); hlist_add_head(&tb2->node, &head->chain); hlist_add_head(&tb2->bhash_node, &tb->bhash2); @@ -143,11 +165,23 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { + const struct sock *sk; + if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); __hlist_del(&tb->bhash_node); kmem_cache_free(cachep, tb); + return; } + + if (tb->fastreuse == -1 && tb->fastreuseport == -1) + return; + sk_for_each_bound(sk, &tb->owners) { + if (!sk_is_connect_bind(sk)) + return; + } + tb->fastreuse = -1; + tb->fastreuseport = -1; } static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, @@ -191,6 +225,7 @@ static void __inet_put_port(struct sock *sk) tb = inet_csk(sk)->icsk_bind_hash; inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; + sk->sk_userlocks &= ~SOCK_CONNECT_BIND; spin_lock(&head2->lock); if (inet_csk(sk)->icsk_bind2_hash) { @@ -277,7 +312,7 @@ bhash2_find: } } if (update_fastreuse) - inet_csk_update_fastreuse(tb, child); + inet_csk_update_fastreuse(child, tb, tb2); inet_bind_hash(child, tb, tb2, port); spin_unlock(&head2->lock); spin_unlock(&head->lock); @@ -950,6 +985,10 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, if (!tb2) { tb2 = new_tb2; inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); + if (sk_is_connect_bind(sk)) { + tb2->fastreuse = -1; + tb2->fastreuseport = -1; + } } inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); @@ -1120,6 +1159,8 @@ ok: head2, tb, sk); if (!tb2) goto error; + tb2->fastreuse = -1; + tb2->fastreuseport = -1; } /* Here we want to add a little bit of randomness to the next source @@ -1132,6 +1173,7 @@ ok: /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); + sk->sk_userlocks |= SOCK_CONNECT_BIND; if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2ca2912f61f4..e1a86130f038 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -208,6 +208,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); + tw->tw_connect_bind = !!(sk->sk_userlocks & SOCK_CONNECT_BIND); tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); -- cgit v1.2.3 From 884eee8e43f3072db4111178c98b9aa5c57bcf92 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:47 +0200 Subject: net/smc: Remove error handling of unregister_dmb() smcd_buf_free() calls smc_ism_unregister_dmb(lgr->smcd, buf_desc) and then unconditionally frees buf_desc. Remove the cleaning up of fields of buf_desc in smc_ism_unregister_dmb(), because it is not helpful. This removes the only usage of ISM_ERROR from the smc module. So move it to drivers/s390/net/ism.h. Signed-off-by: Alexandra Winter Reviewed-by: Mahanta Jambigi Reviewed-by: Dust Li Link: https://patch.msgid.link/20250918110500.1731261-2-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/s390/net/ism.h | 1 + include/net/smc.h | 2 -- net/smc/smc_ism.c | 14 +++++--------- net/smc/smc_ism.h | 3 ++- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 047fa6101555..b5b03db52fce 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -10,6 +10,7 @@ #include #define UTIL_STR_LEN 16 +#define ISM_ERROR 0xFFFF /* * Do not use the first word of the DMB bits to ensure 8 byte aligned access. diff --git a/include/net/smc.h b/include/net/smc.h index db84e4e35080..a9c023dd1380 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -44,8 +44,6 @@ struct smcd_dmb { #define ISM_RESERVED_VLANID 0x1FFF -#define ISM_ERROR 0xFFFF - struct smcd_dev; struct smcd_gid { diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index a58ffb7a0610..fca01b95b65a 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -205,13 +205,13 @@ out: return rc; } -int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) +void smc_ism_unregister_dmb(struct smcd_dev *smcd, + struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; - int rc = 0; if (!dmb_desc->dma_addr) - return rc; + return; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; @@ -219,13 +219,9 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - rc = smcd->ops->unregister_dmb(smcd, &dmb); - if (!rc || rc == ISM_ERROR) { - dmb_desc->cpu_addr = NULL; - dmb_desc->dma_addr = 0; - } + smcd->ops->unregister_dmb(smcd, &dmb); - return rc; + return; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 6763133dd8d0..765aa8fae6fa 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -47,7 +47,8 @@ int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, struct smc_buf_desc *dmb_desc); -int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +void smc_ism_unregister_dmb(struct smcd_dev *dev, + struct smc_buf_desc *dmb_desc); bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd); int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc); -- cgit v1.2.3 From 69baaac9361edd169713562f088829a1be9c51a9 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:53 +0200 Subject: dibs: Define dibs_client_ops and dibs_dev_ops Move the device add() and remove() functions from ism_client to dibs_client_ops and call add_dev()/del_dev() for ism devices and dibs_loopback devices. dibs_client_ops->add_dev() = smcd_register_dev() for the smc_dibs_client. This is the first step to handle ism and loopback devices alike (as dibs devices) in the smc dibs client. Define dibs_dev->ops and move smcd_ops->get_chid to dibs_dev_ops->get_fabric_id() for ism and loopback devices. See below for why this needs to be in the same patch as dibs_client_ops->add_dev(). The following changes contain intermediate steps, that will be obsoleted by follow-on patches, once more functionality has been moved to dibs: Use different smcd_ops and max_dmbs for ism and loopback. Follow-on patches will change SMC-D to directly use dibs_ops instead of smcd_ops. In smcd_register_dev() it is now necessary to identify a dibs_loopback device before smcd_dev and smcd_ops->get_chid() are available. So provide dibs_dev_ops->get_fabric_id() in this patch and evaluate it in smc_ism_is_loopback(). Call smc_loopback_init() in smcd_register_dev() and call smc_loopback_exit() in smcd_unregister_dev() to handle the functionality that is still in smc_loopback. Follow-on patches will move all smc_loopback code to dibs_loopback. In smcd_[un]register_dev() use only ism device name, this will be replaced by dibs device name by a follow-on patch. End of changes with intermediate parts. Allocate an smcd event workqueue for all dibs devices, although dibs_loopback does not generate events. Use kernel memory instead of devres memory for smcd_dev and smcd->conn. Since commit a72178cfe855 ("net/smc: Fix dependency of SMC on ISM") an ism device and its driver can have a longer lifetime than the smc module, so smc should not rely on devres to free its resources [1]. It is now the responsibility of the smc client to free smcd and smcd->conn for all dibs devices, ism devices as well as loopback. Call client->ops->del_dev() for all existing dibs devices in dibs_unregister_client(), so all device related structures can be freed in the client. When dibs_unregister_client() is called in the context of smc_exit() or smc_core_reboot_event(), these functions have already called smc_lgrs_shutdown() which calls smc_smcd_terminate_all(smcd) and sets going_away. This is done a second time in smcd_unregister_dev(). This is analogous to how smcr is handled in these functions, by calling first smc_lgrs_shutdown() and then smc_ib_unregister_client() > smc_ib_remove_dev(), so leave it that way. It may be worth investigating, whether smc_lgrs_shutdown() is still required or useful. Remove CONFIG_SMC_LO. CONFIG_DIBS_LO now controls whether a dibs loopback device exists or not. Link: https://www.kernel.org/doc/Documentation/driver-model/devres.txt [1] Signed-off-by: Alexandra Winter Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20250918110500.1731261-8-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - drivers/dibs/dibs_loopback.c | 11 ++++ drivers/dibs/dibs_main.c | 36 +++++++++++ drivers/s390/net/ism_drv.c | 43 ++++++------- include/linux/dibs.h | 89 ++++++++++++++++++++++++- include/linux/ism.h | 2 - include/net/smc.h | 3 +- net/smc/Kconfig | 13 ---- net/smc/Makefile | 2 +- net/smc/af_smc.c | 12 +--- net/smc/smc_ism.c | 132 ++++++++++++++++++++++++++------------ net/smc/smc_ism.h | 7 +- net/smc/smc_loopback.c | 94 ++++----------------------- net/smc/smc_loopback.h | 17 +---- 15 files changed, 270 insertions(+), 193 deletions(-) (limited to 'include/net') diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 5a2ed07b6198..a97c8d19f643 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -123,7 +123,6 @@ CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y CONFIG_SMC_DIAG=m -CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 4cbdd7e2ff9f..7f7b52d9a33c 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -114,7 +114,6 @@ CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y CONFIG_SMC_DIAG=m -CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 225514a452a8..215986ae54a4 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -18,6 +18,15 @@ /* global loopback device */ static struct dibs_lo_dev *lo_dev; +static u16 dibs_lo_get_fabric_id(struct dibs_dev *dibs) +{ + return DIBS_LOOPBACK_FABRIC; +} + +static const struct dibs_dev_ops dibs_lo_ops = { + .get_fabric_id = dibs_lo_get_fabric_id, +}; + static void dibs_lo_dev_exit(struct dibs_lo_dev *ldev) { dibs_dev_del(ldev->dibs); @@ -40,6 +49,8 @@ static int dibs_lo_dev_probe(void) } ldev->dibs = dibs; + dibs->drv_priv = ldev; + dibs->ops = &dibs_lo_ops; ret = dibs_dev_add(dibs); if (ret) diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index a7e33be36158..f1cfa5849277 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -36,8 +36,10 @@ static struct dibs_dev_list dibs_dev_list = { int dibs_register_client(struct dibs_client *client) { + struct dibs_dev *dibs; int i, rc = -ENOSPC; + mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < MAX_DIBS_CLIENTS; ++i) { if (!clients[i]) { @@ -51,19 +53,37 @@ int dibs_register_client(struct dibs_client *client) } mutex_unlock(&clients_lock); + if (i < MAX_DIBS_CLIENTS) { + /* initialize with all devices that we got so far */ + list_for_each_entry(dibs, &dibs_dev_list.list, list) { + dibs->priv[i] = NULL; + client->ops->add_dev(dibs); + } + } + mutex_unlock(&dibs_dev_list.mutex); + return rc; } EXPORT_SYMBOL_GPL(dibs_register_client); int dibs_unregister_client(struct dibs_client *client) { + struct dibs_dev *dibs; int rc = 0; + mutex_lock(&dibs_dev_list.mutex); + list_for_each_entry(dibs, &dibs_dev_list.list, list) { + clients[client->id]->ops->del_dev(dibs); + dibs->priv[client->id] = NULL; + } + mutex_lock(&clients_lock); clients[client->id] = NULL; if (client->id + 1 == max_client) max_client--; mutex_unlock(&clients_lock); + + mutex_unlock(&dibs_dev_list.mutex); return rc; } EXPORT_SYMBOL_GPL(dibs_unregister_client); @@ -80,7 +100,15 @@ EXPORT_SYMBOL_GPL(dibs_dev_alloc); int dibs_dev_add(struct dibs_dev *dibs) { + int i; + mutex_lock(&dibs_dev_list.mutex); + mutex_lock(&clients_lock); + for (i = 0; i < max_client; ++i) { + if (clients[i]) + clients[i]->ops->add_dev(dibs); + } + mutex_unlock(&clients_lock); list_add(&dibs->list, &dibs_dev_list.list); mutex_unlock(&dibs_dev_list.mutex); @@ -90,7 +118,15 @@ EXPORT_SYMBOL_GPL(dibs_dev_add); void dibs_dev_del(struct dibs_dev *dibs) { + int i; + mutex_lock(&dibs_dev_list.mutex); + mutex_lock(&clients_lock); + for (i = 0; i < max_client; ++i) { + if (clients[i]) + clients[i]->ops->del_dev(dibs); + } + mutex_unlock(&clients_lock); list_del_init(&dibs->list); mutex_unlock(&dibs_dev_list.mutex); } diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 8ecd0cccc7e8..2bd8f64ebb56 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -79,7 +79,6 @@ int ism_register_client(struct ism_client *client) /* initialize with all devices that we got so far */ list_for_each_entry(ism, &ism_dev_list.list, list) { ism->priv[i] = NULL; - client->add(ism); ism_setup_forwarding(client, ism); } } @@ -465,6 +464,16 @@ int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf, } EXPORT_SYMBOL_GPL(ism_move); +static u16 ism_get_chid(struct dibs_dev *dibs) +{ + struct ism_dev *ism = dibs->drv_priv; + + if (!ism || !ism->pdev) + return 0; + + return to_zpci(ism->pdev)->pchid; +} + static void ism_handle_event(struct ism_dev *ism) { struct ism_event *entry; @@ -523,6 +532,10 @@ static irqreturn_t ism_handle_irq(int irq, void *data) return IRQ_HANDLED; } +static const struct dibs_dev_ops ism_ops = { + .get_fabric_id = ism_get_chid, +}; + static int ism_dev_init(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; @@ -564,7 +577,6 @@ static int ism_dev_init(struct ism_dev *ism) mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { if (clients[i]) { - clients[i]->add(ism); ism_setup_forwarding(clients[i], ism); } } @@ -611,12 +623,6 @@ static void ism_dev_exit(struct ism_dev *ism) spin_unlock_irqrestore(&ism->lock, flags); mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < max_client; ++i) { - if (clients[i]) - clients[i]->remove(ism); - } - mutex_unlock(&clients_lock); if (ism_v2_capable) ism_del_vlan_id(ism, ISM_RESERVED_VLANID); @@ -672,7 +678,10 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = -ENOMEM; goto err_resource; } + /* set this up before we enable interrupts */ ism->dibs = dibs; + dibs->drv_priv = ism; + dibs->ops = &ism_ops; ret = ism_dev_init(ism); if (ret) @@ -857,19 +866,6 @@ static void smcd_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = 0; } -static u16 ism_get_chid(struct ism_dev *ism) -{ - if (!ism || !ism->pdev) - return 0; - - return to_zpci(ism->pdev)->pchid; -} - -static u16 smcd_get_chid(struct smcd_dev *smcd) -{ - return ism_get_chid(smcd->priv); -} - static inline struct device *smcd_get_dev(struct smcd_dev *dev) { struct ism_dev *ism = dev->priv; @@ -877,7 +873,7 @@ static inline struct device *smcd_get_dev(struct smcd_dev *dev) return &ism->dev; } -static const struct smcd_ops ism_ops = { +static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, .unregister_dmb = smcd_unregister_dmb, @@ -889,13 +885,12 @@ static const struct smcd_ops ism_ops = { .move_data = smcd_move, .supports_v2 = smcd_supports_v2, .get_local_gid = smcd_get_local_gid, - .get_chid = smcd_get_chid, .get_dev = smcd_get_dev, }; const struct smcd_ops *ism_get_smcd_ops(void) { - return &ism_ops; + return &ism_smcd_ops; } EXPORT_SYMBOL_GPL(ism_get_smcd_ops); #endif diff --git a/include/linux/dibs.h b/include/linux/dibs.h index c12db19c98c0..805ab33271b5 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -34,14 +34,45 @@ * clients. */ +struct dibs_dev; + /* DIBS client * ----------- */ #define MAX_DIBS_CLIENTS 8 +/* All dibs clients have access to all dibs devices. + * A dibs client provides the following functions to be called by dibs layer or + * dibs device drivers: + */ +struct dibs_client_ops { + /** + * add_dev() - add a dibs device + * @dev: device that was added + * + * Will be called during dibs_register_client() for all existing + * dibs devices and whenever a new dibs device is registered. + * dev is usable until dibs_client.remove() is called. + * *dev is protected by device refcounting. + */ + void (*add_dev)(struct dibs_dev *dev); + /** + * del_dev() - remove a dibs device + * @dev: device to be removed + * + * Will be called whenever a dibs device is removed. + * Will be called during dibs_unregister_client() for all existing + * dibs devices and whenever a dibs device is unregistered. + * The device has already stopped initiative for this client: + * No new handlers will be started. + * The device is no longer usable by this client after this call. + */ + void (*del_dev)(struct dibs_dev *dev); +}; struct dibs_client { /* client name for logging and debugging purposes */ const char *name; + const struct dibs_client_ops *ops; /* client index - provided and used by dibs layer */ u8 id; }; @@ -52,6 +83,7 @@ struct dibs_client { * dibs_register_client() - register a client with dibs layer * @client: this client * + * Will call client->ops->add_dev() for all existing dibs devices. * Return: zero on success. */ int dibs_register_client(struct dibs_client *client); @@ -59,21 +91,74 @@ int dibs_register_client(struct dibs_client *client); * dibs_unregister_client() - unregister a client with dibs layer * @client: this client * + * Will call client->ops->del_dev() for all existing dibs devices. * Return: zero on success. */ int dibs_unregister_client(struct dibs_client *client); +/* dibs clients can call dibs device ops. */ + /* DIBS devices * ------------ */ + +/* Defined fabric id / CHID for all loopback devices: + * All dibs loopback devices report this fabric id. In this case devices with + * the same fabric id can NOT communicate via dibs. Only loopback devices with + * the same dibs device gid can communicate (=same device with itself). + */ +#define DIBS_LOOPBACK_FABRIC 0xFFFF + +/* A dibs device provides the following functions to be called by dibs clients. + * They are mandatory, unless marked 'optional'. + */ +struct dibs_dev_ops { + /** + * get_fabric_id() + * @dev: local dibs device + * + * Only devices on the same dibs fabric can communicate. Fabric_id is + * unique inside the same HW system. Use fabric_id for fast negative + * checks, but only query_remote_gid() can give a reliable positive + * answer: + * Different fabric_id: dibs is not possible + * Same fabric_id: dibs may be possible or not + * (e.g. different HW systems) + * EXCEPTION: DIBS_LOOPBACK_FABRIC denotes an ism_loopback device + * that can only communicate with itself. Use dibs_dev.gid + * or query_remote_gid()to determine whether sender and + * receiver use the same ism_loopback device. + * Return: 2 byte dibs fabric id + */ + u16 (*get_fabric_id)(struct dibs_dev *dev); +}; + struct dibs_dev { struct list_head list; + /* To be filled by device driver, before calling dibs_dev_add(): */ + const struct dibs_dev_ops *ops; + /* priv pointer for device driver */ + void *drv_priv; + + /* priv pointer per client; for client usage only */ + void *priv[MAX_DIBS_CLIENTS]; }; +static inline void dibs_set_priv(struct dibs_dev *dev, + struct dibs_client *client, void *priv) +{ + dev->priv[client->id] = priv; +} + +static inline void *dibs_get_priv(struct dibs_dev *dev, + struct dibs_client *client) +{ + return dev->priv[client->id]; +} + /* ------- End of client-only functions ----------- */ -/* - * Functions to be called by dibs device drivers: +/* Functions to be called by dibs device drivers: */ /** * dibs_dev_alloc() - allocate and reference device structure diff --git a/include/linux/ism.h b/include/linux/ism.h index 9a53d3c48c16..c818a25996db 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -59,8 +59,6 @@ struct ism_event { struct ism_client { const char *name; - void (*add)(struct ism_dev *dev); - void (*remove)(struct ism_dev *dev); void (*handle_event)(struct ism_dev *dev, struct ism_event *event); /* Parameter dmbemask contains a bit vector with updated DMBEs, if sent * via ism_move_data(). Callback function must handle all active bits diff --git a/include/net/smc.h b/include/net/smc.h index a9c023dd1380..e271891b85e6 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -15,6 +15,7 @@ #include #include #include +#include #include "linux/ism.h" struct sock; @@ -62,7 +63,6 @@ struct smcd_ops { unsigned int size); int (*supports_v2)(void); void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); - u16 (*get_chid)(struct smcd_dev *dev); struct device* (*get_dev)(struct smcd_dev *dev); /* optional operations */ @@ -81,6 +81,7 @@ struct smcd_dev { const struct smcd_ops *ops; void *priv; void *client; + struct dibs_dev *dibs; struct list_head list; spinlock_t lock; struct smc_connection **conn; diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 40dd60c1d23f..9535d88c2acb 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -20,16 +20,3 @@ config SMC_DIAG smcss. if unsure, say Y. - -config SMC_LO - bool "SMC intra-OS shortcut with loopback-ism" - depends on SMC - default n - help - SMC_LO enables the creation of an Emulated-ISM device named - loopback-ism in SMC and makes use of it for transferring data - when communication occurs within the same OS. This helps in - convenient testing of SMC-D since loopback-ism is independent - of architecture or hardware. - - if unsure, say N. diff --git a/net/smc/Makefile b/net/smc/Makefile index 60f1c87d5212..96ccfdf246df 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -6,4 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o -smc-$(CONFIG_SMC_LO) += smc_loopback.o +smc-y += smc_loopback.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9097e4f24d2b..77b99e8ef35a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -57,7 +57,6 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" -#include "smc_loopback.h" #include "smc_inet.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group @@ -3591,16 +3590,10 @@ static int __init smc_init(void) goto out_sock; } - rc = smc_loopback_init(); - if (rc) { - pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc); - goto out_ib; - } - rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_lo; + goto out_ib; } rc = smc_inet_init(); if (rc) { @@ -3611,8 +3604,6 @@ static int __init smc_init(void) return 0; out_ulp: tcp_unregister_ulp(&smc_ulp_ops); -out_lo: - smc_loopback_exit(); out_ib: smc_ib_unregister_client(); out_sock: @@ -3651,7 +3642,6 @@ static void __exit smc_exit(void) tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); - smc_loopback_exit(); smc_ib_unregister_client(); smc_ism_exit(); destroy_workqueue(smc_close_wq); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index a7a965e3c0ce..415f03910c91 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -15,6 +15,7 @@ #include "smc.h" #include "smc_core.h" #include "smc_ism.h" +#include "smc_loopback.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" @@ -28,23 +29,27 @@ struct smcd_dev_list smcd_dev_list = { static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; +static void smcd_register_dev(struct dibs_dev *dibs); +static void smcd_unregister_dev(struct dibs_dev *dibs); #if IS_ENABLED(CONFIG_ISM) -static void smcd_register_dev(struct ism_dev *ism); -static void smcd_unregister_dev(struct ism_dev *ism); static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, u16 dmbemask); static struct ism_client smc_ism_client = { .name = "SMC-D", - .add = smcd_register_dev, - .remove = smcd_unregister_dev, .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; #endif +static struct dibs_client_ops smc_client_ops = { + .add_dev = smcd_register_dev, + .del_dev = smcd_unregister_dev, +}; + static struct dibs_client smc_dibs_client = { .name = "SMC-D", + .ops = &smc_client_ops, }; static void smc_ism_create_system_eid(void) @@ -86,7 +91,7 @@ void smc_ism_get_system_eid(u8 **eid) u16 smc_ism_get_chid(struct smcd_dev *smcd) { - return smcd->ops->get_chid(smcd); + return smcd->dibs->ops->get_fabric_id(smcd->dibs); } /* HW supports ISM V2 and thus System EID is defined */ @@ -318,7 +323,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); + smc_set_pci_values(ism->pdev, &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -368,7 +373,7 @@ static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; - if (smc_ism_is_loopback(smcd)) + if (smc_ism_is_loopback(smcd->dibs)) goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; @@ -453,24 +458,26 @@ static void smc_ism_event_work(struct work_struct *work) } kfree(wrk); } +#endif -static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, - const struct smcd_ops *ops, int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(const char *name, + const struct smcd_ops *ops, + int max_dmbs) { struct smcd_dev *smcd; - smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); + smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); if (!smcd) return NULL; - smcd->conn = devm_kcalloc(parent, max_dmbs, - sizeof(struct smc_connection *), GFP_KERNEL); + smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), + GFP_KERNEL); if (!smcd->conn) - return NULL; + goto free_smcd; smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) - return NULL; + goto free_conn; smcd->ops = ops; @@ -480,27 +487,58 @@ static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, INIT_LIST_HEAD(&smcd->lgr_list); init_waitqueue_head(&smcd->lgrs_deleted); return smcd; + +free_conn: + kfree(smcd->conn); +free_smcd: + kfree(smcd); + return NULL; } -static void smcd_register_dev(struct ism_dev *ism) +static void smcd_register_dev(struct dibs_dev *dibs) { - const struct smcd_ops *ops = ism_get_smcd_ops(); struct smcd_dev *smcd, *fentry; + const struct smcd_ops *ops; + struct smc_lo_dev *smc_lo; + struct ism_dev *ism; - if (!ops) - return; + if (smc_ism_is_loopback(dibs)) { + if (smc_loopback_init(&smc_lo)) + return; + } - smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, - ISM_NR_DMBS); + if (smc_ism_is_loopback(dibs)) { + ops = smc_lo_get_smcd_ops(); + smcd = smcd_alloc_dev(dev_name(&smc_lo->dev), ops, + SMC_LO_MAX_DMBS); + } else { + ism = dibs->drv_priv; +#if IS_ENABLED(CONFIG_ISM) + ops = ism_get_smcd_ops(); +#endif + smcd = smcd_alloc_dev(dev_name(&ism->pdev->dev), ops, + ISM_NR_DMBS); + } if (!smcd) return; - smcd->priv = ism; - smcd->client = &smc_ism_client; - ism_set_priv(ism, &smc_ism_client, smcd); - if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) - smc_pnetid_by_table_smcd(smcd); - if (smcd->ops->supports_v2()) + smcd->dibs = dibs; + dibs_set_priv(dibs, &smc_dibs_client, smcd); + + if (smc_ism_is_loopback(dibs)) { + smcd->priv = smc_lo; + smc_lo->smcd = smcd; + } else { + smcd->priv = ism; +#if IS_ENABLED(CONFIG_ISM) + ism_set_priv(ism, &smc_ism_client, smcd); + smcd->client = &smc_ism_client; +#endif + if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); + } + + if (smc_ism_is_loopback(dibs) || smcd->ops->supports_v2()) smc_ism_set_v2_capable(); mutex_lock(&smcd_dev_list.mutex); /* sort list: @@ -510,7 +548,7 @@ static void smcd_register_dev(struct ism_dev *ism) if (!smcd->pnetid[0]) { fentry = list_first_entry_or_null(&smcd_dev_list.list, struct smcd_dev, list); - if (fentry && smc_ism_is_loopback(fentry)) + if (fentry && smc_ism_is_loopback(fentry->dibs)) list_add(&smcd->list, &fentry->list); else list_add(&smcd->list, &smcd_dev_list.list); @@ -519,32 +557,46 @@ static void smcd_register_dev(struct ism_dev *ism) } mutex_unlock(&smcd_dev_list.mutex); - if (smc_pnet_is_pnetid_set(smcd->pnetid)) - pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&ism->dev), smcd->pnetid, - smcd->pnetid_by_user ? - " (user defined)" : - ""); - else - pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", - dev_name(&ism->dev)); + if (smc_ism_is_loopback(dibs)) { + pr_warn_ratelimited("smc: adding smcd loopback device\n"); + } else { + if (smc_pnet_is_pnetid_set(smcd->pnetid)) + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&ism->dev), smcd->pnetid, + smcd->pnetid_by_user ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", + dev_name(&ism->dev)); + } return; } -static void smcd_unregister_dev(struct ism_dev *ism) +static void smcd_unregister_dev(struct dibs_dev *dibs) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); + struct ism_dev *ism = dibs->drv_priv; - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ism->dev)); + if (smc_ism_is_loopback(dibs)) { + pr_warn_ratelimited("smc: removing smcd loopback device\n"); + } else { + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&ism->dev)); + } smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); + if (smc_ism_is_loopback(dibs)) + smc_loopback_exit(); + kfree(smcd->conn); + kfree(smcd); } +#if IS_ENABLED(CONFIG_ISM) /* SMCD Device event handler. Called from ISM device interrupt handler. * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 765aa8fae6fa..04699951d03f 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "smc.h" @@ -85,14 +86,14 @@ static inline bool __smc_ism_is_emulated(u16 chid) static inline bool smc_ism_is_emulated(struct smcd_dev *smcd) { - u16 chid = smcd->ops->get_chid(smcd); + u16 chid = smcd->dibs->ops->get_fabric_id(smcd->dibs); return __smc_ism_is_emulated(chid); } -static inline bool smc_ism_is_loopback(struct smcd_dev *smcd) +static inline bool smc_ism_is_loopback(struct dibs_dev *dibs) { - return (smcd->ops->get_chid(smcd) == 0xFFFF); + return (dibs->ops->get_fabric_id(dibs) == DIBS_LOOPBACK_FABRIC); } #endif diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 1853c26fbbbb..37d8366419f7 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -35,8 +35,6 @@ static void smc_lo_generate_ids(struct smc_lo_dev *ldev) memcpy(&lgid->gid, &uuid, sizeof(lgid->gid)); memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid), sizeof(lgid->gid_ext)); - - ldev->chid = SMC_LO_RESERVED_CHID; } static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, @@ -257,11 +255,6 @@ static void smc_lo_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = ldev->local_gid.gid_ext; } -static u16 smc_lo_get_chid(struct smcd_dev *smcd) -{ - return ((struct smc_lo_dev *)smcd->priv)->chid; -} - static struct device *smc_lo_get_dev(struct smcd_dev *smcd) { return &((struct smc_lo_dev *)smcd->priv)->dev; @@ -281,72 +274,15 @@ static const struct smcd_ops lo_ops = { .signal_event = NULL, .move_data = smc_lo_move_data, .get_local_gid = smc_lo_get_local_gid, - .get_chid = smc_lo_get_chid, .get_dev = smc_lo_get_dev, }; -static struct smcd_dev *smcd_lo_alloc_dev(const struct smcd_ops *ops, - int max_dmbs) -{ - struct smcd_dev *smcd; - - smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); - if (!smcd) - return NULL; - - smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), - GFP_KERNEL); - if (!smcd->conn) - goto out_smcd; - - smcd->ops = ops; - - spin_lock_init(&smcd->lock); - spin_lock_init(&smcd->lgr_lock); - INIT_LIST_HEAD(&smcd->vlan); - INIT_LIST_HEAD(&smcd->lgr_list); - init_waitqueue_head(&smcd->lgrs_deleted); - return smcd; - -out_smcd: - kfree(smcd); - return NULL; -} - -static int smcd_lo_register_dev(struct smc_lo_dev *ldev) -{ - struct smcd_dev *smcd; - - smcd = smcd_lo_alloc_dev(&lo_ops, SMC_LO_MAX_DMBS); - if (!smcd) - return -ENOMEM; - ldev->smcd = smcd; - smcd->priv = ldev; - smc_ism_set_v2_capable(); - mutex_lock(&smcd_dev_list.mutex); - list_add(&smcd->list, &smcd_dev_list.list); - mutex_unlock(&smcd_dev_list.mutex); - pr_warn_ratelimited("smc: adding smcd device %s\n", - dev_name(&ldev->dev)); - return 0; -} - -static void smcd_lo_unregister_dev(struct smc_lo_dev *ldev) +const struct smcd_ops *smc_lo_get_smcd_ops(void) { - struct smcd_dev *smcd = ldev->smcd; - - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ldev->dev)); - smcd->going_away = 1; - smc_smcd_terminate_all(smcd); - mutex_lock(&smcd_dev_list.mutex); - list_del_init(&smcd->list); - mutex_unlock(&smcd_dev_list.mutex); - kfree(smcd->conn); - kfree(smcd); + return &lo_ops; } -static int smc_lo_dev_init(struct smc_lo_dev *ldev) +static void smc_lo_dev_init(struct smc_lo_dev *ldev) { smc_lo_generate_ids(ldev); rwlock_init(&ldev->dmb_ht_lock); @@ -354,12 +290,11 @@ static int smc_lo_dev_init(struct smc_lo_dev *ldev) atomic_set(&ldev->dmb_cnt, 0); init_waitqueue_head(&ldev->ldev_release); - return smcd_lo_register_dev(ldev); + return; } static void smc_lo_dev_exit(struct smc_lo_dev *ldev) { - smcd_lo_unregister_dev(ldev); if (atomic_read(&ldev->dmb_cnt)) wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); } @@ -375,7 +310,6 @@ static void smc_lo_dev_release(struct device *dev) static int smc_lo_dev_probe(void) { struct smc_lo_dev *ldev; - int ret; ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); if (!ldev) @@ -385,17 +319,11 @@ static int smc_lo_dev_probe(void) ldev->dev.release = smc_lo_dev_release; device_initialize(&ldev->dev); dev_set_name(&ldev->dev, smc_lo_dev_name); - - ret = smc_lo_dev_init(ldev); - if (ret) - goto free_dev; + smc_lo_dev_init(ldev); lo_dev = ldev; /* global loopback device */ - return 0; -free_dev: - put_device(&ldev->dev); - return ret; + return 0; } static void smc_lo_dev_remove(void) @@ -405,11 +333,17 @@ static void smc_lo_dev_remove(void) smc_lo_dev_exit(lo_dev); put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */ + lo_dev = NULL; } -int smc_loopback_init(void) +int smc_loopback_init(struct smc_lo_dev **smc_lb) { - return smc_lo_dev_probe(); + int ret; + + ret = smc_lo_dev_probe(); + if (!ret) + *smc_lb = lo_dev; + return ret; } void smc_loopback_exit(void) diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h index 04dc6808d2e1..76c62526e2e5 100644 --- a/net/smc/smc_loopback.h +++ b/net/smc/smc_loopback.h @@ -17,10 +17,8 @@ #include #include -#if IS_ENABLED(CONFIG_SMC_LO) #define SMC_LO_MAX_DMBS 5000 #define SMC_LO_DMBS_HASH_BITS 12 -#define SMC_LO_RESERVED_CHID 0xFFFF struct smc_lo_dmb_node { struct hlist_node list; @@ -35,7 +33,6 @@ struct smc_lo_dmb_node { struct smc_lo_dev { struct smcd_dev *smcd; struct device dev; - u16 chid; struct smcd_gid local_gid; atomic_t dmb_cnt; rwlock_t dmb_ht_lock; @@ -44,17 +41,9 @@ struct smc_lo_dev { wait_queue_head_t ldev_release; }; -int smc_loopback_init(void); +const struct smcd_ops *smc_lo_get_smcd_ops(void); + +int smc_loopback_init(struct smc_lo_dev **smc_lb); void smc_loopback_exit(void); -#else -static inline int smc_loopback_init(void) -{ - return 0; -} - -static inline void smc_loopback_exit(void) -{ -} -#endif #endif /* _SMC_LOOPBACK_H */ -- cgit v1.2.3 From 845c334a0186a23c2ac4abfb444e499fec831b24 Mon Sep 17 00:00:00 2001 From: Julian Ruess Date: Thu, 18 Sep 2025 13:04:54 +0200 Subject: dibs: Move struct device to dibs_dev Move struct device from ism_dev and smc_lo_dev to dibs_dev, and define a corresponding release function. Free ism_dev in ism_remove() and smc_lo_dev in smc_lo_dev_remove(). Replace smcd->ops->get_dev(smcd) by using dibs->dev directly. An alternative design would be to embed dibs_dev as a field in ism_dev and do the same for other dibs device driver specific structs. However that would have the disadvantage that each dibs device driver needs to allocate dibs_dev and each dibs device driver needs a different device release function. The advantage would be that ism_dev and other device driver specific structs would be covered by device reference counts. Signed-off-by: Julian Ruess Co-developed-by: Alexandra Winter Signed-off-by: Alexandra Winter Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20250918110500.1731261-9-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 15 +++++++-------- drivers/dibs/dibs_main.c | 21 +++++++++++++++++++- drivers/s390/net/ism_drv.c | 40 ++++++++------------------------------ include/linux/dibs.h | 1 + include/linux/ism.h | 1 - include/net/smc.h | 1 - net/smc/smc_core.c | 4 ++-- net/smc/smc_ism.c | 46 ++++++++++++++++++-------------------------- net/smc/smc_loopback.c | 21 +------------------- net/smc/smc_loopback.h | 1 - net/smc/smc_pnet.c | 13 +++++-------- 11 files changed, 63 insertions(+), 101 deletions(-) (limited to 'include/net') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 215986ae54a4..76e479d5724b 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -15,6 +15,7 @@ #include "dibs_loopback.h" +static const char dibs_lo_dev_name[] = "lo"; /* global loopback device */ static struct dibs_lo_dev *lo_dev; @@ -27,11 +28,6 @@ static const struct dibs_dev_ops dibs_lo_ops = { .get_fabric_id = dibs_lo_get_fabric_id, }; -static void dibs_lo_dev_exit(struct dibs_lo_dev *ldev) -{ - dibs_dev_del(ldev->dibs); -} - static int dibs_lo_dev_probe(void) { struct dibs_lo_dev *ldev; @@ -52,6 +48,9 @@ static int dibs_lo_dev_probe(void) dibs->drv_priv = ldev; dibs->ops = &dibs_lo_ops; + dibs->dev.parent = NULL; + dev_set_name(&dibs->dev, "%s", dibs_lo_dev_name); + ret = dibs_dev_add(dibs); if (ret) goto err_reg; @@ -60,7 +59,7 @@ static int dibs_lo_dev_probe(void) err_reg: /* pairs with dibs_dev_alloc() */ - kfree(dibs); + put_device(&dibs->dev); kfree(ldev); return ret; @@ -71,9 +70,9 @@ static void dibs_lo_dev_remove(void) if (!lo_dev) return; - dibs_lo_dev_exit(lo_dev); + dibs_dev_del(lo_dev->dibs); /* pairs with dibs_dev_alloc() */ - kfree(lo_dev->dibs); + put_device(&lo_dev->dibs->dev); kfree(lo_dev); lo_dev = NULL; } diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index f1cfa5849277..610b6c452211 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -88,11 +88,24 @@ int dibs_unregister_client(struct dibs_client *client) } EXPORT_SYMBOL_GPL(dibs_unregister_client); +static void dibs_dev_release(struct device *dev) +{ + struct dibs_dev *dibs; + + dibs = container_of(dev, struct dibs_dev, dev); + + kfree(dibs); +} + struct dibs_dev *dibs_dev_alloc(void) { struct dibs_dev *dibs; dibs = kzalloc(sizeof(*dibs), GFP_KERNEL); + if (!dibs) + return dibs; + dibs->dev.release = dibs_dev_release; + device_initialize(&dibs->dev); return dibs; } @@ -100,7 +113,11 @@ EXPORT_SYMBOL_GPL(dibs_dev_alloc); int dibs_dev_add(struct dibs_dev *dibs) { - int i; + int i, ret; + + ret = device_add(&dibs->dev); + if (ret) + return ret; mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); @@ -129,6 +146,8 @@ void dibs_dev_del(struct dibs_dev *dibs) mutex_unlock(&clients_lock); list_del_init(&dibs->list); mutex_unlock(&dibs_dev_list.mutex); + + device_del(&dibs->dev); } EXPORT_SYMBOL_GPL(dibs_dev_del); diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 2bd8f64ebb56..4096ea9faa7e 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -602,15 +602,6 @@ out: return ret; } -static void ism_dev_release(struct device *dev) -{ - struct ism_dev *ism; - - ism = container_of(dev, struct ism_dev, dev); - - kfree(ism); -} - static void ism_dev_exit(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; @@ -649,17 +640,10 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) spin_lock_init(&ism->cmd_lock); dev_set_drvdata(&pdev->dev, ism); ism->pdev = pdev; - ism->dev.parent = &pdev->dev; - ism->dev.release = ism_dev_release; - device_initialize(&ism->dev); - dev_set_name(&ism->dev, "%s", dev_name(&pdev->dev)); - ret = device_add(&ism->dev); - if (ret) - goto err_dev; ret = pci_enable_device_mem(pdev); if (ret) - goto err; + goto err_dev; ret = pci_request_mem_regions(pdev, DRV_NAME); if (ret) @@ -687,6 +671,9 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (ret) goto err_dibs; + dibs->dev.parent = &pdev->dev; + dev_set_name(&dibs->dev, "%s", dev_name(&pdev->dev)); + ret = dibs_dev_add(dibs); if (ret) goto err_ism; @@ -697,16 +684,14 @@ err_ism: ism_dev_exit(ism); err_dibs: /* pairs with dibs_dev_alloc() */ - kfree(dibs); + put_device(&dibs->dev); err_resource: pci_release_mem_regions(pdev); err_disable: pci_disable_device(pdev); -err: - device_del(&ism->dev); err_dev: dev_set_drvdata(&pdev->dev, NULL); - put_device(&ism->dev); + kfree(ism); return ret; } @@ -719,13 +704,12 @@ static void ism_remove(struct pci_dev *pdev) dibs_dev_del(dibs); ism_dev_exit(ism); /* pairs with dibs_dev_alloc() */ - kfree(dibs); + put_device(&dibs->dev); pci_release_mem_regions(pdev); pci_disable_device(pdev); - device_del(&ism->dev); dev_set_drvdata(&pdev->dev, NULL); - put_device(&ism->dev); + kfree(ism); } static struct pci_driver ism_driver = { @@ -866,13 +850,6 @@ static void smcd_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = 0; } -static inline struct device *smcd_get_dev(struct smcd_dev *dev) -{ - struct ism_dev *ism = dev->priv; - - return &ism->dev; -} - static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, @@ -885,7 +862,6 @@ static const struct smcd_ops ism_smcd_ops = { .move_data = smcd_move, .supports_v2 = smcd_supports_v2, .get_local_gid = smcd_get_local_gid, - .get_dev = smcd_get_dev, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 805ab33271b5..793c6e1ece0f 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -135,6 +135,7 @@ struct dibs_dev_ops { struct dibs_dev { struct list_head list; + struct device dev; /* To be filled by device driver, before calling dibs_dev_add(): */ const struct dibs_dev_ops *ops; /* priv pointer for device driver */ diff --git a/include/linux/ism.h b/include/linux/ism.h index c818a25996db..84f1afb3dded 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -42,7 +42,6 @@ struct ism_dev { struct ism_eq *ieq; dma_addr_t ieq_dma_addr; - struct device dev; u64 local_gid; int ieq_idx; diff --git a/include/net/smc.h b/include/net/smc.h index e271891b85e6..05faac83371e 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -63,7 +63,6 @@ struct smcd_ops { unsigned int size); int (*supports_v2)(void); void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); - struct device* (*get_dev)(struct smcd_dev *dev); /* optional operations */ int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a9e80f44307d..42ab0795d563 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -924,7 +924,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) if (ini->is_smcd) { /* SMC-D specific settings */ smcd = ini->ism_dev[ini->ism_selected]; - get_device(smcd->ops->get_dev(smcd)); + get_device(&smcd->dibs->dev); lgr->peer_gid.gid = ini->ism_peer_gid[ini->ism_selected].gid; lgr->peer_gid.gid_ext = @@ -1474,7 +1474,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(lgr->smcd->ops->get_dev(lgr->smcd)); + put_device(&lgr->smcd->dibs->dev); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 415f03910c91..6a6e7c9641e8 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -303,12 +303,12 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; + struct dibs_dev *dibs; struct nlattr *attrs; - struct ism_dev *ism; int use_cnt = 0; void *nlh; - ism = smcd->priv; + dibs = smcd->dibs; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); @@ -323,7 +323,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(ism->pdev, &smc_pci_dev); + smc_set_pci_values(to_pci_dev(dibs->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -509,14 +509,14 @@ static void smcd_register_dev(struct dibs_dev *dibs) if (smc_ism_is_loopback(dibs)) { ops = smc_lo_get_smcd_ops(); - smcd = smcd_alloc_dev(dev_name(&smc_lo->dev), ops, + smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, SMC_LO_MAX_DMBS); } else { ism = dibs->drv_priv; #if IS_ENABLED(CONFIG_ISM) ops = ism_get_smcd_ops(); #endif - smcd = smcd_alloc_dev(dev_name(&ism->pdev->dev), ops, + smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, ISM_NR_DMBS); } if (!smcd) @@ -534,10 +534,11 @@ static void smcd_register_dev(struct dibs_dev *dibs) ism_set_priv(ism, &smc_ism_client, smcd); smcd->client = &smc_ism_client; #endif - if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) - smc_pnetid_by_table_smcd(smcd); } + if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); + if (smc_ism_is_loopback(dibs) || smcd->ops->supports_v2()) smc_ism_set_v2_capable(); mutex_lock(&smcd_dev_list.mutex); @@ -557,33 +558,24 @@ static void smcd_register_dev(struct dibs_dev *dibs) } mutex_unlock(&smcd_dev_list.mutex); - if (smc_ism_is_loopback(dibs)) { - pr_warn_ratelimited("smc: adding smcd loopback device\n"); - } else { - if (smc_pnet_is_pnetid_set(smcd->pnetid)) - pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&ism->dev), smcd->pnetid, - smcd->pnetid_by_user ? - " (user defined)" : - ""); - else - pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", - dev_name(&ism->dev)); - } + if (smc_pnet_is_pnetid_set(smcd->pnetid)) + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&dibs->dev), smcd->pnetid, + smcd->pnetid_by_user ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", + dev_name(&dibs->dev)); return; } static void smcd_unregister_dev(struct dibs_dev *dibs) { struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); - struct ism_dev *ism = dibs->drv_priv; - if (smc_ism_is_loopback(dibs)) { - pr_warn_ratelimited("smc: removing smcd loopback device\n"); - } else { - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ism->dev)); - } + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&dibs->dev)); smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 37d8366419f7..262d0d0df4d0 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -23,7 +23,6 @@ #define SMC_LO_SUPPORT_NOCOPY 0x1 #define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) -static const char smc_lo_dev_name[] = "loopback-ism"; static struct smc_lo_dev *lo_dev; static void smc_lo_generate_ids(struct smc_lo_dev *ldev) @@ -255,11 +254,6 @@ static void smc_lo_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = ldev->local_gid.gid_ext; } -static struct device *smc_lo_get_dev(struct smcd_dev *smcd) -{ - return &((struct smc_lo_dev *)smcd->priv)->dev; -} - static const struct smcd_ops lo_ops = { .query_remote_gid = smc_lo_query_rgid, .register_dmb = smc_lo_register_dmb, @@ -274,7 +268,6 @@ static const struct smcd_ops lo_ops = { .signal_event = NULL, .move_data = smc_lo_move_data, .get_local_gid = smc_lo_get_local_gid, - .get_dev = smc_lo_get_dev, }; const struct smcd_ops *smc_lo_get_smcd_ops(void) @@ -299,14 +292,6 @@ static void smc_lo_dev_exit(struct smc_lo_dev *ldev) wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); } -static void smc_lo_dev_release(struct device *dev) -{ - struct smc_lo_dev *ldev = - container_of(dev, struct smc_lo_dev, dev); - - kfree(ldev); -} - static int smc_lo_dev_probe(void) { struct smc_lo_dev *ldev; @@ -315,10 +300,6 @@ static int smc_lo_dev_probe(void) if (!ldev) return -ENOMEM; - ldev->dev.parent = NULL; - ldev->dev.release = smc_lo_dev_release; - device_initialize(&ldev->dev); - dev_set_name(&ldev->dev, smc_lo_dev_name); smc_lo_dev_init(ldev); lo_dev = ldev; /* global loopback device */ @@ -332,7 +313,7 @@ static void smc_lo_dev_remove(void) return; smc_lo_dev_exit(lo_dev); - put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */ + kfree(lo_dev); lo_dev = NULL; } diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h index 76c62526e2e5..a033bf10890a 100644 --- a/net/smc/smc_loopback.h +++ b/net/smc/smc_loopback.h @@ -32,7 +32,6 @@ struct smc_lo_dmb_node { struct smc_lo_dev { struct smcd_dev *smcd; - struct device dev; struct smcd_gid local_gid; atomic_t dmb_cnt; rwlock_t dmb_ht_lock; diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 7225b5fa17a6..d0df7f2b03aa 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -169,7 +169,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " "%.16s\n", - dev_name(smcd->ops->get_dev(smcd)), + dev_name(&smcd->dibs->dev), smcd->pnetid); memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); smcd->pnetid_by_user = false; @@ -332,7 +332,7 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)), + if (!strncmp(dev_name(&smcd_dev->dibs->dev), smcd_name, IB_DEVICE_NAME_MAX - 1)) goto out; } @@ -413,7 +413,6 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, bool smcddev_applied = true; bool ibdev_applied = true; struct smcd_dev *smcd; - struct device *dev; bool new_ibdev; /* try to apply the pnetid to active devices */ @@ -431,10 +430,8 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, if (smcd) { smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); if (smcddev_applied) { - dev = smcd->ops->get_dev(smcd); - pr_warn_ratelimited("smc: smcd device %s " - "applied user defined pnetid " - "%.16s\n", dev_name(dev), + pr_warn_ratelimited("smc: smcd device %s applied user defined pnetid %.16s\n", + dev_name(&smcd->dibs->dev), smcd->pnetid); } } @@ -1193,7 +1190,7 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { - const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev)); + const char *ib_name = dev_name(&smcddev->dibs->dev); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; -- cgit v1.2.3 From 05e68d8dedf34f270cc3769ffe7f0ed413f23add Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:56 +0200 Subject: dibs: Local gid for dibs devices Define a uuid_t GID attribute to identify a dibs device. SMC uses 64 Bit and 128 Bit Global Identifiers (GIDs) per device, that need to be sent via the SMC protocol. Because the smc code uses integers, network endianness and host endianness need to be considered. Avoid this in the dibs layer by using uuid_t byte arrays. Future patches could change SMC to use uuid_t. For now conversion helper functions are introduced. ISM devices provide 64 Bit GIDs. Map them to dibs uuid_t GIDs like this: _________________________________________ | 64 Bit ISM-vPCI GID | 00000000_00000000 | ----------------------------------------- If interpreted as UUID [1], this would be interpreted as the UIID variant, that is reserved for NCS backward compatibility. So it will not collide with UUIDs that were generated according to the standard. smc_loopback already uses version 4 UUIDs as 128 Bit GIDs, move that to dibs loopback. A temporary change to smc_lo_query_rgid() is required, that will be moved to dibs_loopback with a follow-on patch. Provide gid of a dibs device as sysfs read-only attribute. Link: https://datatracker.ietf.org/doc/html/rfc4122 [1] Signed-off-by: Alexandra Winter Reviewed-by: Julian Ruess Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20250918110500.1731261-11-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 1 + drivers/dibs/dibs_main.c | 12 ++++++++++++ drivers/s390/net/ism.h | 9 +++++++++ drivers/s390/net/ism_drv.c | 30 +++++++++--------------------- include/linux/dibs.h | 3 +++ include/linux/ism.h | 1 - include/net/smc.h | 1 - net/smc/smc_clc.c | 6 +++--- net/smc/smc_core.c | 2 +- net/smc/smc_diag.c | 2 +- net/smc/smc_ism.h | 22 ++++++++++++++++++++++ net/smc/smc_loopback.c | 29 ++++------------------------- net/smc/smc_loopback.h | 1 - 13 files changed, 65 insertions(+), 54 deletions(-) (limited to 'include/net') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 76e479d5724b..d7e6fa5e90f3 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -46,6 +46,7 @@ static int dibs_lo_dev_probe(void) ldev->dibs = dibs; dibs->drv_priv = ldev; + uuid_gen(&dibs->gid); dibs->ops = &dibs_lo_ops; dibs->dev.parent = NULL; diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index b3f21805aa59..f20ed0594a51 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -114,6 +114,17 @@ struct dibs_dev *dibs_dev_alloc(void) } EXPORT_SYMBOL_GPL(dibs_dev_alloc); +static ssize_t gid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct dibs_dev *dibs; + + dibs = container_of(dev, struct dibs_dev, dev); + + return sysfs_emit(buf, "%pUb\n", &dibs->gid); +} +static DEVICE_ATTR_RO(gid); + static ssize_t fabric_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -128,6 +139,7 @@ static ssize_t fabric_id_show(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_RO(fabric_id); static struct attribute *dibs_dev_attrs[] = { + &dev_attr_gid.attr, &dev_attr_fabric_id.attr, NULL, }; diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 3078779fa71e..1b9fa14da20c 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -67,6 +67,15 @@ union ism_reg_ieq { } response; } __aligned(16); +/* ISM-vPCI devices provide 64 Bit GIDs + * Map them to ISM UUID GIDs like this: + * _________________________________________ + * | 64 Bit ISM-vPCI GID | 00000000_00000000 | + * ----------------------------------------- + * This will be interpreted as a UIID variant, that is reserved + * for NCS backward compatibility. So it will not collide with + * proper UUIDs. + */ union ism_read_gid { struct { struct ism_req_hdr hdr; diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index ab1d61eb3e3b..e58c55fb03c2 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -272,8 +272,9 @@ static int unregister_ieq(struct ism_dev *ism) return 0; } -static int ism_read_local_gid(struct ism_dev *ism) +static int ism_read_local_gid(struct dibs_dev *dibs) { + struct ism_dev *ism = dibs->drv_priv; union ism_read_gid cmd; int ret; @@ -285,7 +286,8 @@ static int ism_read_local_gid(struct ism_dev *ism) if (ret) goto out; - ism->local_gid = cmd.response.gid; + memset(&dibs->gid, 0, sizeof(dibs->gid)); + memcpy(&dibs->gid, &cmd.response.gid, sizeof(cmd.response.gid)); out: return ret; } @@ -563,10 +565,6 @@ static int ism_dev_init(struct ism_dev *ism) if (ret) goto unreg_sba; - ret = ism_read_local_gid(ism); - if (ret) - goto unreg_ieq; - if (!ism_add_vlan_id(ism, ISM_RESERVED_VLANID)) /* hardware is V2 capable */ ism_v2_capable = true; @@ -588,8 +586,6 @@ static int ism_dev_init(struct ism_dev *ism) query_info(ism); return 0; -unreg_ieq: - unregister_ieq(ism); unreg_sba: unregister_sba(ism); free_irq: @@ -672,6 +668,11 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (ret) goto err_dibs; + /* after ism_dev_init() we can call ism function to set gid */ + ret = ism_read_local_gid(dibs); + if (ret) + goto err_ism; + dibs->dev.parent = &pdev->dev; zdev = to_zpci(pdev); @@ -841,18 +842,6 @@ static int smcd_supports_v2(void) return ism_v2_capable; } -static u64 ism_get_local_gid(struct ism_dev *ism) -{ - return ism->local_gid; -} - -static void smcd_get_local_gid(struct smcd_dev *smcd, - struct smcd_gid *smcd_gid) -{ - smcd_gid->gid = ism_get_local_gid(smcd->priv); - smcd_gid->gid_ext = 0; -} - static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, @@ -864,7 +853,6 @@ static const struct smcd_ops ism_smcd_ops = { .signal_event = smcd_signal_ieq, .move_data = smcd_move, .supports_v2 = smcd_supports_v2, - .get_local_gid = smcd_get_local_gid, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 793c6e1ece0f..904f37505c27 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -10,6 +10,8 @@ #define _DIBS_H #include +#include + /* DIBS - Direct Internal Buffer Sharing - concept * ----------------------------------------------- * In the case of multiple system sharing the same hardware, dibs fabrics can @@ -138,6 +140,7 @@ struct dibs_dev { struct device dev; /* To be filled by device driver, before calling dibs_dev_add(): */ const struct dibs_dev_ops *ops; + uuid_t gid; /* priv pointer for device driver */ void *drv_priv; diff --git a/include/linux/ism.h b/include/linux/ism.h index 84f1afb3dded..a926dd61b5a1 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -42,7 +42,6 @@ struct ism_dev { struct ism_eq *ieq; dma_addr_t ieq_dma_addr; - u64 local_gid; int ieq_idx; struct ism_client *subs[MAX_CLIENTS]; diff --git a/include/net/smc.h b/include/net/smc.h index 05faac83371e..9cb8385bbc6e 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -62,7 +62,6 @@ struct smcd_ops { bool sf, unsigned int offset, void *data, unsigned int size); int (*supports_v2)(void); - void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); /* optional operations */ int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 09745baa1017..157aace169d4 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -916,7 +916,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) /* add SMC-D specifics */ if (ini->ism_dev[0]) { smcd = ini->ism_dev[0]; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); pclc_smcd->ism.gid = htonll(smcd_gid.gid); pclc_smcd->ism.chid = htons(smc_ism_get_chid(ini->ism_dev[0])); @@ -966,7 +966,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { smcd = ini->ism_dev[i]; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); gidchids[entry].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); gidchids[entry].gid = htonll(smcd_gid.gid); @@ -1059,7 +1059,7 @@ smcd_clc_prep_confirm_accept(struct smc_connection *conn, /* SMC-D specific settings */ memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); clc->hdr.typev1 = SMC_TYPE_D; clc->d0.gid = htonll(smcd_gid.gid); clc->d0.token = htonll(conn->rmb_desc->token); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 42ab0795d563..be0c2da83d2b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -555,7 +555,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, smcd_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 8ed2f6689b01..bf0beaa23bdb 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -175,7 +175,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, dinfo.linkid = *((u32 *)conn->lgr->id); dinfo.peer_gid = conn->lgr->peer_gid.gid; dinfo.peer_gid_ext = conn->lgr->peer_gid.gid_ext; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); dinfo.my_gid = smcd_gid.gid; dinfo.my_gid_ext = smcd_gid.gid_ext; dinfo.token = conn->rmb_desc->token; diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 04699951d03f..139e99da2c9f 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -96,4 +96,26 @@ static inline bool smc_ism_is_loopback(struct dibs_dev *dibs) return (dibs->ops->get_fabric_id(dibs) == DIBS_LOOPBACK_FABRIC); } +static inline void copy_to_smcdgid(struct smcd_gid *sgid, uuid_t *dibs_gid) +{ + __be64 temp; + + memcpy(&temp, dibs_gid, sizeof(sgid->gid)); + sgid->gid = ntohll(temp); + memcpy(&temp, (uint8_t *)dibs_gid + sizeof(sgid->gid), + sizeof(sgid->gid_ext)); + sgid->gid_ext = ntohll(temp); +} + +static inline void copy_to_dibsgid(uuid_t *dibs_gid, struct smcd_gid *sgid) +{ + __be64 temp; + + temp = htonll(sgid->gid); + memcpy(dibs_gid, &temp, sizeof(sgid->gid)); + temp = htonll(sgid->gid_ext); + memcpy((uint8_t *)dibs_gid + sizeof(sgid->gid), &temp, + sizeof(sgid->gid_ext)); +} + #endif diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 262d0d0df4d0..454d9d6a6e8f 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -13,6 +13,7 @@ #include #include +#include #include #include "smc_cdc.h" @@ -25,25 +26,14 @@ static struct smc_lo_dev *lo_dev; -static void smc_lo_generate_ids(struct smc_lo_dev *ldev) -{ - struct smcd_gid *lgid = &ldev->local_gid; - uuid_t uuid; - - uuid_gen(&uuid); - memcpy(&lgid->gid, &uuid, sizeof(lgid->gid)); - memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid), - sizeof(lgid->gid_ext)); -} - static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, u32 vid_valid, u32 vid) { - struct smc_lo_dev *ldev = smcd->priv; + uuid_t temp; + copy_to_dibsgid(&temp, rgid); /* rgid should be the same as lgid */ - if (!ldev || rgid->gid != ldev->local_gid.gid || - rgid->gid_ext != ldev->local_gid.gid_ext) + if (!uuid_equal(&temp, &smcd->dibs->gid)) return -ENETUNREACH; return 0; } @@ -245,15 +235,6 @@ static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, return 0; } -static void smc_lo_get_local_gid(struct smcd_dev *smcd, - struct smcd_gid *smcd_gid) -{ - struct smc_lo_dev *ldev = smcd->priv; - - smcd_gid->gid = ldev->local_gid.gid; - smcd_gid->gid_ext = ldev->local_gid.gid_ext; -} - static const struct smcd_ops lo_ops = { .query_remote_gid = smc_lo_query_rgid, .register_dmb = smc_lo_register_dmb, @@ -267,7 +248,6 @@ static const struct smcd_ops lo_ops = { .reset_vlan_required = NULL, .signal_event = NULL, .move_data = smc_lo_move_data, - .get_local_gid = smc_lo_get_local_gid, }; const struct smcd_ops *smc_lo_get_smcd_ops(void) @@ -277,7 +257,6 @@ const struct smcd_ops *smc_lo_get_smcd_ops(void) static void smc_lo_dev_init(struct smc_lo_dev *ldev) { - smc_lo_generate_ids(ldev); rwlock_init(&ldev->dmb_ht_lock); hash_init(ldev->dmb_ht); atomic_set(&ldev->dmb_cnt, 0); diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h index a033bf10890a..33bb96ec8b77 100644 --- a/net/smc/smc_loopback.h +++ b/net/smc/smc_loopback.h @@ -32,7 +32,6 @@ struct smc_lo_dmb_node { struct smc_lo_dev { struct smcd_dev *smcd; - struct smcd_gid local_gid; atomic_t dmb_cnt; rwlock_t dmb_ht_lock; DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS); -- cgit v1.2.3 From 92a0f7bb081dde6e88368816b8ba51352ddabb1d Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:57 +0200 Subject: dibs: Move vlan support to dibs_dev_ops It can be debated how much benefit definition of vlan ids for dibs devices brings, as the dmbs are accessible only by a single peer anyhow. But ism provides vlan support and smcd exploits it, so move it to dibs layer as an optional feature. smcd_loopback simply ignores all vlan settings, do the same in dibs_loopback. SMC-D and ISM have a method to use the invalid VLAN ID 1FFF (ISM_RESERVED_VLANID), to indicate that both communication peers support routable SMC-Dv2. Tolerate it in dibs, but move it to SMC only. Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-12-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/s390/net/ism_drv.c | 47 ++++++---------------------------------------- include/linux/dibs.h | 19 +++++++++++++++++++ include/net/smc.h | 5 ----- net/smc/smc_ism.c | 14 +++++++++----- net/smc/smc_loopback.c | 5 ----- 5 files changed, 34 insertions(+), 56 deletions(-) (limited to 'include/net') diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index e58c55fb03c2..ed4c28ca355b 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -36,7 +36,6 @@ static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */ /* a list for fast mapping */ static u8 max_client; static DEFINE_MUTEX(clients_lock); -static bool ism_v2_capable; struct ism_dev_list { struct list_head list; struct mutex mutex; /* protects ism device list */ @@ -409,8 +408,9 @@ out: } EXPORT_SYMBOL_GPL(ism_unregister_dmb); -static int ism_add_vlan_id(struct ism_dev *ism, u64 vlan_id) +static int ism_add_vlan_id(struct dibs_dev *dibs, u64 vlan_id) { + struct ism_dev *ism = dibs->drv_priv; union ism_set_vlan_id cmd; memset(&cmd, 0, sizeof(cmd)); @@ -422,8 +422,9 @@ static int ism_add_vlan_id(struct ism_dev *ism, u64 vlan_id) return ism_cmd(ism, &cmd); } -static int ism_del_vlan_id(struct ism_dev *ism, u64 vlan_id) +static int ism_del_vlan_id(struct dibs_dev *dibs, u64 vlan_id) { + struct ism_dev *ism = dibs->drv_priv; union ism_set_vlan_id cmd; memset(&cmd, 0, sizeof(cmd)); @@ -536,6 +537,8 @@ static irqreturn_t ism_handle_irq(int irq, void *data) static const struct dibs_dev_ops ism_ops = { .get_fabric_id = ism_get_chid, + .add_vlan_id = ism_add_vlan_id, + .del_vlan_id = ism_del_vlan_id, }; static int ism_dev_init(struct ism_dev *ism) @@ -565,12 +568,6 @@ static int ism_dev_init(struct ism_dev *ism) if (ret) goto unreg_sba; - if (!ism_add_vlan_id(ism, ISM_RESERVED_VLANID)) - /* hardware is V2 capable */ - ism_v2_capable = true; - else - ism_v2_capable = false; - mutex_lock(&ism_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { @@ -611,8 +608,6 @@ static void ism_dev_exit(struct ism_dev *ism) mutex_lock(&ism_dev_list.mutex); - if (ism_v2_capable) - ism_del_vlan_id(ism, ISM_RESERVED_VLANID); unregister_ieq(ism); unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); @@ -786,26 +781,6 @@ static int smcd_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) return ism_unregister_dmb(smcd->priv, (struct ism_dmb *)dmb); } -static int smcd_add_vlan_id(struct smcd_dev *smcd, u64 vlan_id) -{ - return ism_add_vlan_id(smcd->priv, vlan_id); -} - -static int smcd_del_vlan_id(struct smcd_dev *smcd, u64 vlan_id) -{ - return ism_del_vlan_id(smcd->priv, vlan_id); -} - -static int smcd_set_vlan_required(struct smcd_dev *smcd) -{ - return ism_cmd_simple(smcd->priv, ISM_SET_VLAN); -} - -static int smcd_reset_vlan_required(struct smcd_dev *smcd) -{ - return ism_cmd_simple(smcd->priv, ISM_RESET_VLAN); -} - static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, u32 event_code, u64 info) { @@ -837,22 +812,12 @@ static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, return ism_move(smcd->priv, dmb_tok, idx, sf, offset, data, size); } -static int smcd_supports_v2(void) -{ - return ism_v2_capable; -} - static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, .unregister_dmb = smcd_unregister_dmb, - .add_vlan_id = smcd_add_vlan_id, - .del_vlan_id = smcd_del_vlan_id, - .set_vlan_required = smcd_set_vlan_required, - .reset_vlan_required = smcd_reset_vlan_required, .signal_event = smcd_signal_ieq, .move_data = smcd_move, - .supports_v2 = smcd_supports_v2, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 904f37505c27..166148fb8d76 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -133,6 +133,25 @@ struct dibs_dev_ops { * Return: 2 byte dibs fabric id */ u16 (*get_fabric_id)(struct dibs_dev *dev); + /** + * add_vlan_id() - add dibs device to vlan (optional, deprecated) + * @dev: dibs device + * @vlan_id: vlan id + * + * In order to write into a vlan-tagged dmb, the remote device needs + * to belong to the this vlan. A device can belong to more than 1 vlan. + * Any device can access an untagged dmb. + * Deprecated, only supported for backwards compatibility. + * Return: zero on success + */ + int (*add_vlan_id)(struct dibs_dev *dev, u64 vlan_id); + /** + * del_vlan_id() - remove dibs device from vlan (optional, deprecated) + * @dev: dibs device + * @vlan_id: vlan id + * Return: zero on success + */ + int (*del_vlan_id)(struct dibs_dev *dev, u64 vlan_id); }; struct dibs_dev { diff --git a/include/net/smc.h b/include/net/smc.h index 9cb8385bbc6e..51b4aefc106a 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -61,13 +61,8 @@ struct smcd_ops { int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); - int (*supports_v2)(void); /* optional operations */ - int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*set_vlan_required)(struct smcd_dev *dev); - int (*reset_vlan_required)(struct smcd_dev *dev); int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, u32 trigger_irq, u32 event_code, u64 info); int (*support_dmb_nocopy)(struct smcd_dev *dev); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 6a6e7c9641e8..5118441bed18 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -140,7 +140,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; - if (!smcd->ops->add_vlan_id) + if (!smcd->dibs->ops->add_vlan_id) return -EOPNOTSUPP; /* create new vlan entry, in case we need it */ @@ -163,7 +163,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) /* no existing entry found. * add new entry to device; might fail, e.g., if HW limit reached */ - if (smcd->ops->add_vlan_id(smcd, vlanid)) { + if (smcd->dibs->ops->add_vlan_id(smcd->dibs, vlanid)) { kfree(new_vlan); rc = -EIO; goto out; @@ -187,7 +187,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; - if (!smcd->ops->del_vlan_id) + if (!smcd->dibs->ops->del_vlan_id) return -EOPNOTSUPP; spin_lock_irqsave(&smcd->lock, flags); @@ -205,7 +205,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) } /* Found and the last reference just gone */ - if (smcd->ops->del_vlan_id(smcd, vlanid)) + if (smcd->dibs->ops->del_vlan_id(smcd->dibs, vlanid)) rc = -EIO; list_del(&vlan->list); kfree(vlan); @@ -539,8 +539,12 @@ static void smcd_register_dev(struct dibs_dev *dibs) if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); - if (smc_ism_is_loopback(dibs) || smcd->ops->supports_v2()) + if (smc_ism_is_loopback(dibs) || + (dibs->ops->add_vlan_id && + !dibs->ops->add_vlan_id(dibs, ISM_RESERVED_VLANID))) { smc_ism_set_v2_capable(); + } + mutex_lock(&smcd_dev_list.mutex); /* sort list: * - devices without pnetid before devices with pnetid; diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 454d9d6a6e8f..982a19430313 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -20,7 +20,6 @@ #include "smc_ism.h" #include "smc_loopback.h" -#define SMC_LO_V2_CAPABLE 0x1 /* loopback-ism acts as ISMv2 */ #define SMC_LO_SUPPORT_NOCOPY 0x1 #define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) @@ -242,10 +241,6 @@ static const struct smcd_ops lo_ops = { .support_dmb_nocopy = smc_lo_support_dmb_nocopy, .attach_dmb = smc_lo_attach_dmb, .detach_dmb = smc_lo_detach_dmb, - .add_vlan_id = NULL, - .del_vlan_id = NULL, - .set_vlan_required = NULL, - .reset_vlan_required = NULL, .signal_event = NULL, .move_data = smc_lo_move_data, }; -- cgit v1.2.3 From 719c3b67bb7ea95bb8158b03c75641c8fc8f94a0 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:58 +0200 Subject: dibs: Move query_remote_gid() to dibs_dev_ops Provide the dibs_dev_ops->query_remote_gid() in ism and dibs_loopback dibs_devices. And call it in smc dibs_client. Reviewed-by: Julian Ruess Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-13-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 10 ++++++++++ drivers/s390/net/ism_drv.c | 41 ++++++++++++++++++----------------------- include/linux/dibs.h | 14 ++++++++++++++ include/net/smc.h | 2 -- net/smc/smc_ism.c | 8 ++++++-- net/smc/smc_loopback.c | 13 ------------- 6 files changed, 48 insertions(+), 40 deletions(-) (limited to 'include/net') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index d7e6fa5e90f3..6b53e626a6d1 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -24,8 +24,18 @@ static u16 dibs_lo_get_fabric_id(struct dibs_dev *dibs) return DIBS_LOOPBACK_FABRIC; } +static int dibs_lo_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, + u32 vid_valid, u32 vid) +{ + /* rgid should be the same as lgid */ + if (!uuid_equal(rgid, &dibs->gid)) + return -ENETUNREACH; + return 0; +} + static const struct dibs_dev_ops dibs_lo_ops = { .get_fabric_id = dibs_lo_get_fabric_id, + .query_remote_gid = dibs_lo_query_rgid, }; static int dibs_lo_dev_probe(void) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index ed4c28ca355b..121b3a2be760 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -291,6 +291,23 @@ out: return ret; } +static int ism_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, + u32 vid_valid, u32 vid) +{ + struct ism_dev *ism = dibs->drv_priv; + union ism_query_rgid cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.request.hdr.cmd = ISM_QUERY_RGID; + cmd.request.hdr.len = sizeof(cmd.request); + + memcpy(&cmd.request.rgid, rgid, sizeof(cmd.request.rgid)); + cmd.request.vlan_valid = vid_valid; + cmd.request.vlan_id = vid; + + return ism_cmd(ism, &cmd); +} + static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); @@ -537,6 +554,7 @@ static irqreturn_t ism_handle_irq(int irq, void *data) static const struct dibs_dev_ops ism_ops = { .get_fabric_id = ism_get_chid, + .query_remote_gid = ism_query_rgid, .add_vlan_id = ism_add_vlan_id, .del_vlan_id = ism_del_vlan_id, }; @@ -748,28 +766,6 @@ module_exit(ism_exit); /*************************** SMC-D Implementation *****************************/ #if IS_ENABLED(CONFIG_SMC) -static int ism_query_rgid(struct ism_dev *ism, u64 rgid, u32 vid_valid, - u32 vid) -{ - union ism_query_rgid cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.request.hdr.cmd = ISM_QUERY_RGID; - cmd.request.hdr.len = sizeof(cmd.request); - - cmd.request.rgid = rgid; - cmd.request.vlan_valid = vid_valid; - cmd.request.vlan_id = vid; - - return ism_cmd(ism, &cmd); -} - -static int smcd_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 vid_valid, u32 vid) -{ - return ism_query_rgid(smcd->priv, rgid->gid, vid_valid, vid); -} - static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, void *client) { @@ -813,7 +809,6 @@ static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, } static const struct smcd_ops ism_smcd_ops = { - .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, .unregister_dmb = smcd_unregister_dmb, .signal_event = smcd_signal_ieq, diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 166148fb8d76..c75a40fe3039 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -133,6 +133,20 @@ struct dibs_dev_ops { * Return: 2 byte dibs fabric id */ u16 (*get_fabric_id)(struct dibs_dev *dev); + /** + * query_remote_gid() + * @dev: local dibs device + * @rgid: gid of remote dibs device + * @vid_valid: if zero, vid will be ignored; + * deprecated, ignored if device does not support vlan + * @vid: VLAN id; deprecated, ignored if device does not support vlan + * + * Query whether a remote dibs device is reachable via this local device + * and this vlan id. + * Return: 0 if remote gid is reachable. + */ + int (*query_remote_gid)(struct dibs_dev *dev, const uuid_t *rgid, + u32 vid_valid, u32 vid); /** * add_vlan_id() - add dibs device to vlan (optional, deprecated) * @dev: dibs device diff --git a/include/net/smc.h b/include/net/smc.h index 51b4aefc106a..5bd135fb4d49 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -53,8 +53,6 @@ struct smcd_gid { }; struct smcd_ops { - int (*query_remote_gid)(struct smcd_dev *dev, struct smcd_gid *rgid, - u32 vid_valid, u32 vid); int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, void *client); int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 5118441bed18..d20d00b46825 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -77,8 +77,12 @@ static void smc_ism_create_system_eid(void) int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { - return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, - vlan_id); + struct dibs_dev *dibs = smcd->dibs; + uuid_t ism_rgid; + + copy_to_dibsgid(&ism_rgid, peer_gid); + return dibs->ops->query_remote_gid(dibs, &ism_rgid, vlan_id ? 1 : 0, + vlan_id); } void smc_ism_get_system_eid(u8 **eid) diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 982a19430313..52cba01cb209 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -25,18 +25,6 @@ static struct smc_lo_dev *lo_dev; -static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 vid_valid, u32 vid) -{ - uuid_t temp; - - copy_to_dibsgid(&temp, rgid); - /* rgid should be the same as lgid */ - if (!uuid_equal(&temp, &smcd->dibs->gid)) - return -ENETUNREACH; - return 0; -} - static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, void *client_priv) { @@ -235,7 +223,6 @@ static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, } static const struct smcd_ops lo_ops = { - .query_remote_gid = smc_lo_query_rgid, .register_dmb = smc_lo_register_dmb, .unregister_dmb = smc_lo_unregister_dmb, .support_dmb_nocopy = smc_lo_support_dmb_nocopy, -- cgit v1.2.3 From cc21191b584c6f7836b0f10774f8278b7cbfba10 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:59 +0200 Subject: dibs: Move data path to dibs layer Use struct dibs_dmb instead of struct smc_dmb and move the corresponding client tables to dibs_dev. Leave driver specific implementation details like sba in the device drivers. Register and unregister dmbs via dibs_dev_ops. A dmb is dedicated to a single client, but a dibs device can have dmbs for more than one client. Trigger dibs clients via dibs_client_ops->handle_irq(), when data is received into a dmb. For dibs_loopback replace scheduling an smcd receive tasklet with calling dibs_client_ops->handle_irq(). For loopback devices attach_dmb(), detach_dmb() and move_data() need to access the dmb tables, so move those to dibs_dev_ops in this patch as well. Remove remaining definitions of smc_loopback as they are no longer required, now that everything is in dibs_loopback. Note that struct ism_client and struct ism_dev are still required in smc until a follow-on patch moves event handling to dibs. (Loopback does not use events). Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-14-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 257 +++++++++++++++++++++++++++++++++++++ drivers/dibs/dibs_loopback.h | 19 +++ drivers/dibs/dibs_main.c | 56 ++++++++- drivers/s390/net/ism_drv.c | 121 ++++++++---------- include/linux/dibs.h | 177 ++++++++++++++++++++++++++ include/linux/ism.h | 23 ---- include/net/smc.h | 22 ---- net/smc/Makefile | 1 - net/smc/smc_ism.c | 70 +++++------ net/smc/smc_ism.h | 4 +- net/smc/smc_loopback.c | 294 ------------------------------------------- net/smc/smc_loopback.h | 47 ------- 12 files changed, 591 insertions(+), 500 deletions(-) delete mode 100644 net/smc/smc_loopback.c delete mode 100644 net/smc/smc_loopback.h (limited to 'include/net') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 6b53e626a6d1..b3fd0f8100d4 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -9,12 +9,18 @@ * */ +#include +#include #include #include +#include #include #include "dibs_loopback.h" +#define DIBS_LO_SUPPORT_NOCOPY 0x1 +#define DIBS_DMA_ADDR_INVALID (~(dma_addr_t)0) + static const char dibs_lo_dev_name[] = "lo"; /* global loopback device */ static struct dibs_lo_dev *lo_dev; @@ -33,11 +39,259 @@ static int dibs_lo_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, return 0; } +static int dibs_lo_max_dmbs(void) +{ + return DIBS_LO_MAX_DMBS; +} + +static int dibs_lo_register_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb, + struct dibs_client *client) +{ + struct dibs_lo_dmb_node *dmb_node, *tmp_node; + struct dibs_lo_dev *ldev; + unsigned long flags; + int sba_idx, rc; + + ldev = dibs->drv_priv; + sba_idx = dmb->idx; + /* check space for new dmb */ + for_each_clear_bit(sba_idx, ldev->sba_idx_mask, DIBS_LO_MAX_DMBS) { + if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask)) + break; + } + if (sba_idx == DIBS_LO_MAX_DMBS) + return -ENOSPC; + + dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL); + if (!dmb_node) { + rc = -ENOMEM; + goto err_bit; + } + + dmb_node->sba_idx = sba_idx; + dmb_node->len = dmb->dmb_len; + dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC); + if (!dmb_node->cpu_addr) { + rc = -ENOMEM; + goto err_node; + } + dmb_node->dma_addr = DIBS_DMA_ADDR_INVALID; + refcount_set(&dmb_node->refcnt, 1); + +again: + /* add new dmb into hash table */ + get_random_bytes(&dmb_node->token, sizeof(dmb_node->token)); + write_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) { + if (tmp_node->token == dmb_node->token) { + write_unlock_bh(&ldev->dmb_ht_lock); + goto again; + } + } + hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token); + write_unlock_bh(&ldev->dmb_ht_lock); + atomic_inc(&ldev->dmb_cnt); + + dmb->idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[sba_idx] = client->id; + spin_unlock_irqrestore(&dibs->lock, flags); + + return 0; + +err_node: + kfree(dmb_node); +err_bit: + clear_bit(sba_idx, ldev->sba_idx_mask); + return rc; +} + +static void __dibs_lo_unregister_dmb(struct dibs_lo_dev *ldev, + struct dibs_lo_dmb_node *dmb_node) +{ + /* remove dmb from hash table */ + write_lock_bh(&ldev->dmb_ht_lock); + hash_del(&dmb_node->list); + write_unlock_bh(&ldev->dmb_ht_lock); + + clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); + kfree(dmb_node->cpu_addr); + kfree(dmb_node); + + if (atomic_dec_and_test(&ldev->dmb_cnt)) + wake_up(&ldev->ldev_release); +} + +static int dibs_lo_unregister_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb) +{ + struct dibs_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + unsigned long flags; + + ldev = dibs->drv_priv; + + /* find dmb from hash table */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + read_unlock_bh(&ldev->dmb_ht_lock); + if (!dmb_node) + return -EINVAL; + + if (refcount_dec_and_test(&dmb_node->refcnt)) { + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[dmb_node->sba_idx] = NO_DIBS_CLIENT; + spin_unlock_irqrestore(&dibs->lock, flags); + + __dibs_lo_unregister_dmb(ldev, dmb_node); + } + return 0; +} + +static int dibs_lo_support_dmb_nocopy(struct dibs_dev *dibs) +{ + return DIBS_LO_SUPPORT_NOCOPY; +} + +static int dibs_lo_attach_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb) +{ + struct dibs_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + + ldev = dibs->drv_priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (!refcount_inc_not_zero(&dmb_node->refcnt)) + /* the dmb is being unregistered, but has + * not been removed from the hash table. + */ + return -EINVAL; + + /* provide dmb information */ + dmb->idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + return 0; +} + +static int dibs_lo_detach_dmb(struct dibs_dev *dibs, u64 token) +{ + struct dibs_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + + ldev = dibs->drv_priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) { + if (tmp_node->token == token) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (refcount_dec_and_test(&dmb_node->refcnt)) + __dibs_lo_unregister_dmb(ldev, dmb_node); + return 0; +} + +static int dibs_lo_move_data(struct dibs_dev *dibs, u64 dmb_tok, + unsigned int idx, bool sf, unsigned int offset, + void *data, unsigned int size) +{ + struct dibs_lo_dmb_node *rmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + u16 s_mask; + u8 client_id; + u32 sba_idx; + + ldev = dibs->drv_priv; + + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { + if (tmp_node->token == dmb_tok) { + rmb_node = tmp_node; + break; + } + } + if (!rmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + memcpy((char *)rmb_node->cpu_addr + offset, data, size); + sba_idx = rmb_node->sba_idx; + read_unlock_bh(&ldev->dmb_ht_lock); + + if (!sf) + return 0; + + spin_lock(&dibs->lock); + client_id = dibs->dmb_clientid_arr[sba_idx]; + s_mask = ror16(0x1000, idx); + if (likely(client_id != NO_DIBS_CLIENT && dibs->subs[client_id])) + dibs->subs[client_id]->ops->handle_irq(dibs, sba_idx, s_mask); + spin_unlock(&dibs->lock); + + return 0; +} + static const struct dibs_dev_ops dibs_lo_ops = { .get_fabric_id = dibs_lo_get_fabric_id, .query_remote_gid = dibs_lo_query_rgid, + .max_dmbs = dibs_lo_max_dmbs, + .register_dmb = dibs_lo_register_dmb, + .unregister_dmb = dibs_lo_unregister_dmb, + .move_data = dibs_lo_move_data, + .support_mmapped_rdmb = dibs_lo_support_dmb_nocopy, + .attach_dmb = dibs_lo_attach_dmb, + .detach_dmb = dibs_lo_detach_dmb, }; +static void dibs_lo_dev_init(struct dibs_lo_dev *ldev) +{ + rwlock_init(&ldev->dmb_ht_lock); + hash_init(ldev->dmb_ht); + atomic_set(&ldev->dmb_cnt, 0); + init_waitqueue_head(&ldev->ldev_release); +} + +static void dibs_lo_dev_exit(struct dibs_lo_dev *ldev) +{ + if (atomic_read(&ldev->dmb_cnt)) + wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); +} + static int dibs_lo_dev_probe(void) { struct dibs_lo_dev *ldev; @@ -56,6 +310,7 @@ static int dibs_lo_dev_probe(void) ldev->dibs = dibs; dibs->drv_priv = ldev; + dibs_lo_dev_init(ldev); uuid_gen(&dibs->gid); dibs->ops = &dibs_lo_ops; @@ -69,6 +324,7 @@ static int dibs_lo_dev_probe(void) return 0; err_reg: + kfree(dibs->dmb_clientid_arr); /* pairs with dibs_dev_alloc() */ put_device(&dibs->dev); kfree(ldev); @@ -82,6 +338,7 @@ static void dibs_lo_dev_remove(void) return; dibs_dev_del(lo_dev->dibs); + dibs_lo_dev_exit(lo_dev); /* pairs with dibs_dev_alloc() */ put_device(&lo_dev->dibs->dev); kfree(lo_dev); diff --git a/drivers/dibs/dibs_loopback.h b/drivers/dibs/dibs_loopback.h index fd03b6333a24..0664f6a8e662 100644 --- a/drivers/dibs/dibs_loopback.h +++ b/drivers/dibs/dibs_loopback.h @@ -13,13 +13,32 @@ #define _DIBS_LOOPBACK_H #include +#include +#include #include #include #if IS_ENABLED(CONFIG_DIBS_LO) +#define DIBS_LO_DMBS_HASH_BITS 12 +#define DIBS_LO_MAX_DMBS 5000 + +struct dibs_lo_dmb_node { + struct hlist_node list; + u64 token; + u32 len; + u32 sba_idx; + void *cpu_addr; + dma_addr_t dma_addr; + refcount_t refcnt; +}; struct dibs_lo_dev { struct dibs_dev *dibs; + atomic_t dmb_cnt; + rwlock_t dmb_ht_lock; + DECLARE_BITMAP(sba_idx_mask, DIBS_LO_MAX_DMBS); + DECLARE_HASHTABLE(dmb_ht, DIBS_LO_DMBS_HASH_BITS); + wait_queue_head_t ldev_release; }; int dibs_loopback_init(void); diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index f20ed0594a51..aacb3ea7825a 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -36,6 +36,16 @@ static struct dibs_dev_list dibs_dev_list = { .mutex = __MUTEX_INITIALIZER(dibs_dev_list.mutex), }; +static void dibs_setup_forwarding(struct dibs_client *client, + struct dibs_dev *dibs) +{ + unsigned long flags; + + spin_lock_irqsave(&dibs->lock, flags); + dibs->subs[client->id] = client; + spin_unlock_irqrestore(&dibs->lock, flags); +} + int dibs_register_client(struct dibs_client *client) { struct dibs_dev *dibs; @@ -60,6 +70,7 @@ int dibs_register_client(struct dibs_client *client) list_for_each_entry(dibs, &dibs_dev_list.list, list) { dibs->priv[i] = NULL; client->ops->add_dev(dibs); + dibs_setup_forwarding(client, dibs); } } mutex_unlock(&dibs_dev_list.mutex); @@ -71,10 +82,25 @@ EXPORT_SYMBOL_GPL(dibs_register_client); int dibs_unregister_client(struct dibs_client *client) { struct dibs_dev *dibs; + unsigned long flags; + int max_dmbs; int rc = 0; mutex_lock(&dibs_dev_list.mutex); list_for_each_entry(dibs, &dibs_dev_list.list, list) { + spin_lock_irqsave(&dibs->lock, flags); + max_dmbs = dibs->ops->max_dmbs(); + for (int i = 0; i < max_dmbs; ++i) { + if (dibs->dmb_clientid_arr[i] == client->id) { + WARN(1, "%s: attempt to unregister '%s' with registered dmb(s)\n", + __func__, client->name); + rc = -EBUSY; + goto err_reg_dmb; + } + } + /* Stop forwarding IRQs */ + dibs->subs[client->id] = NULL; + spin_unlock_irqrestore(&dibs->lock, flags); clients[client->id]->ops->del_dev(dibs); dibs->priv[client->id] = NULL; } @@ -87,6 +113,11 @@ int dibs_unregister_client(struct dibs_client *client) mutex_unlock(&dibs_dev_list.mutex); return rc; + +err_reg_dmb: + spin_unlock_irqrestore(&dibs->lock, flags); + mutex_unlock(&dibs_dev_list.mutex); + return rc; } EXPORT_SYMBOL_GPL(dibs_unregister_client); @@ -150,11 +181,19 @@ static const struct attribute_group dibs_dev_attr_group = { int dibs_dev_add(struct dibs_dev *dibs) { + int max_dmbs; int i, ret; + max_dmbs = dibs->ops->max_dmbs(); + spin_lock_init(&dibs->lock); + dibs->dmb_clientid_arr = kzalloc(max_dmbs, GFP_KERNEL); + if (!dibs->dmb_clientid_arr) + return -ENOMEM; + memset(dibs->dmb_clientid_arr, NO_DIBS_CLIENT, max_dmbs); + ret = device_add(&dibs->dev); if (ret) - return ret; + goto free_client_arr; ret = sysfs_create_group(&dibs->dev.kobj, &dibs_dev_attr_group); if (ret) { @@ -164,8 +203,10 @@ int dibs_dev_add(struct dibs_dev *dibs) mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { - if (clients[i]) + if (clients[i]) { clients[i]->ops->add_dev(dibs); + dibs_setup_forwarding(clients[i], dibs); + } } mutex_unlock(&clients_lock); list_add(&dibs->list, &dibs_dev_list.list); @@ -175,6 +216,8 @@ int dibs_dev_add(struct dibs_dev *dibs) err_device_del: device_del(&dibs->dev); +free_client_arr: + kfree(dibs->dmb_clientid_arr); return ret; } @@ -182,8 +225,16 @@ EXPORT_SYMBOL_GPL(dibs_dev_add); void dibs_dev_del(struct dibs_dev *dibs) { + unsigned long flags; int i; + sysfs_remove_group(&dibs->dev.kobj, &dibs_dev_attr_group); + + spin_lock_irqsave(&dibs->lock, flags); + for (i = 0; i < MAX_DIBS_CLIENTS; ++i) + dibs->subs[i] = NULL; + spin_unlock_irqrestore(&dibs->lock, flags); + mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { @@ -195,6 +246,7 @@ void dibs_dev_del(struct dibs_dev *dibs) mutex_unlock(&dibs_dev_list.mutex); device_del(&dibs->dev); + kfree(dibs->dmb_clientid_arr); } EXPORT_SYMBOL_GPL(dibs_dev_del); diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 121b3a2be760..346d1ea8650b 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -98,14 +98,6 @@ int ism_unregister_client(struct ism_client *client) spin_lock_irqsave(&ism->lock, flags); /* Stop forwarding IRQs and events */ ism->subs[client->id] = NULL; - for (int i = 0; i < ISM_NR_DMBS; ++i) { - if (ism->sba_client_arr[i] == client->id) { - WARN(1, "%s: attempt to unregister '%s' with registered dmb(s)\n", - __func__, client->name); - rc = -EBUSY; - goto err_reg_dmb; - } - } spin_unlock_irqrestore(&ism->lock, flags); } mutex_unlock(&ism_dev_list.mutex); @@ -116,11 +108,6 @@ int ism_unregister_client(struct ism_client *client) max_client--; mutex_unlock(&clients_lock); return rc; - -err_reg_dmb: - spin_unlock_irqrestore(&ism->lock, flags); - mutex_unlock(&ism_dev_list.mutex); - return rc; } EXPORT_SYMBOL_GPL(ism_unregister_client); @@ -308,15 +295,20 @@ static int ism_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, return ism_cmd(ism, &cmd); } -static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) +static int ism_max_dmbs(void) +{ + return ISM_NR_DMBS; +} + +static void ism_free_dmb(struct ism_dev *ism, struct dibs_dmb *dmb) { - clear_bit(dmb->sba_idx, ism->sba_bitmap); + clear_bit(dmb->idx, ism->sba_bitmap); dma_unmap_page(&ism->pdev->dev, dmb->dma_addr, dmb->dmb_len, DMA_FROM_DEVICE); folio_put(virt_to_folio(dmb->cpu_addr)); } -static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) +static int ism_alloc_dmb(struct ism_dev *ism, struct dibs_dmb *dmb) { struct folio *folio; unsigned long bit; @@ -325,16 +317,16 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev)) return -EINVAL; - if (!dmb->sba_idx) { + if (!dmb->idx) { bit = find_next_zero_bit(ism->sba_bitmap, ISM_NR_DMBS, ISM_DMB_BIT_OFFSET); if (bit == ISM_NR_DMBS) return -ENOSPC; - dmb->sba_idx = bit; + dmb->idx = bit; } - if (dmb->sba_idx < ISM_DMB_BIT_OFFSET || - test_and_set_bit(dmb->sba_idx, ism->sba_bitmap)) + if (dmb->idx < ISM_DMB_BIT_OFFSET || + test_and_set_bit(dmb->idx, ism->sba_bitmap)) return -EINVAL; folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | @@ -359,13 +351,14 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) out_free: kfree(dmb->cpu_addr); out_bit: - clear_bit(dmb->sba_idx, ism->sba_bitmap); + clear_bit(dmb->idx, ism->sba_bitmap); return rc; } -int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, - struct ism_client *client) +static int ism_register_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb, + struct dibs_client *client) { + struct ism_dev *ism = dibs->drv_priv; union ism_reg_dmb cmd; unsigned long flags; int ret; @@ -380,10 +373,10 @@ int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, cmd.request.dmb = dmb->dma_addr; cmd.request.dmb_len = dmb->dmb_len; - cmd.request.sba_idx = dmb->sba_idx; + cmd.request.sba_idx = dmb->idx; cmd.request.vlan_valid = dmb->vlan_valid; cmd.request.vlan_id = dmb->vlan_id; - cmd.request.rgid = dmb->rgid; + memcpy(&cmd.request.rgid, &dmb->rgid, sizeof(u64)); ret = ism_cmd(ism, &cmd); if (ret) { @@ -391,16 +384,16 @@ int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, goto out; } dmb->dmb_tok = cmd.response.dmb_tok; - spin_lock_irqsave(&ism->lock, flags); - ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = client->id; - spin_unlock_irqrestore(&ism->lock, flags); + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[dmb->idx - ISM_DMB_BIT_OFFSET] = client->id; + spin_unlock_irqrestore(&dibs->lock, flags); out: return ret; } -EXPORT_SYMBOL_GPL(ism_register_dmb); -int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) +static int ism_unregister_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb) { + struct ism_dev *ism = dibs->drv_priv; union ism_unreg_dmb cmd; unsigned long flags; int ret; @@ -411,9 +404,9 @@ int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) cmd.request.dmb_tok = dmb->dmb_tok; - spin_lock_irqsave(&ism->lock, flags); - ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = NO_CLIENT; - spin_unlock_irqrestore(&ism->lock, flags); + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[dmb->idx - ISM_DMB_BIT_OFFSET] = NO_DIBS_CLIENT; + spin_unlock_irqrestore(&dibs->lock, flags); ret = ism_cmd(ism, &cmd); if (ret && ret != ISM_ERROR) @@ -423,7 +416,6 @@ int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) out: return ret; } -EXPORT_SYMBOL_GPL(ism_unregister_dmb); static int ism_add_vlan_id(struct dibs_dev *dibs, u64 vlan_id) { @@ -459,9 +451,11 @@ static unsigned int max_bytes(unsigned int start, unsigned int len, return min(boundary - (start & (boundary - 1)), len); } -int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf, - unsigned int offset, void *data, unsigned int size) +static int ism_move(struct dibs_dev *dibs, u64 dmb_tok, unsigned int idx, + bool sf, unsigned int offset, void *data, + unsigned int size) { + struct ism_dev *ism = dibs->drv_priv; unsigned int bytes; u64 dmb_req; int ret; @@ -482,7 +476,6 @@ int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf, return 0; } -EXPORT_SYMBOL_GPL(ism_move); static u16 ism_get_chid(struct dibs_dev *dibs) { @@ -518,14 +511,17 @@ static irqreturn_t ism_handle_irq(int irq, void *data) { struct ism_dev *ism = data; unsigned long bit, end; + struct dibs_dev *dibs; unsigned long *bv; u16 dmbemask; u8 client_id; + dibs = ism->dibs; + bv = (void *) &ism->sba->dmb_bits[ISM_DMB_WORD_OFFSET]; end = sizeof(ism->sba->dmb_bits) * BITS_PER_BYTE - ISM_DMB_BIT_OFFSET; - spin_lock(&ism->lock); + spin_lock(&dibs->lock); ism->sba->s = 0; barrier(); for (bit = 0;;) { @@ -537,10 +533,13 @@ static irqreturn_t ism_handle_irq(int irq, void *data) dmbemask = ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET]; ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET] = 0; barrier(); - client_id = ism->sba_client_arr[bit]; - if (unlikely(client_id == NO_CLIENT || !ism->subs[client_id])) + client_id = dibs->dmb_clientid_arr[bit]; + if (unlikely(client_id == NO_DIBS_CLIENT || + !dibs->subs[client_id])) continue; - ism->subs[client_id]->handle_irq(ism, bit + ISM_DMB_BIT_OFFSET, dmbemask); + dibs->subs[client_id]->ops->handle_irq(dibs, + bit + ISM_DMB_BIT_OFFSET, + dmbemask); } if (ism->sba->e) { @@ -548,13 +547,17 @@ static irqreturn_t ism_handle_irq(int irq, void *data) barrier(); ism_handle_event(ism); } - spin_unlock(&ism->lock); + spin_unlock(&dibs->lock); return IRQ_HANDLED; } static const struct dibs_dev_ops ism_ops = { .get_fabric_id = ism_get_chid, .query_remote_gid = ism_query_rgid, + .max_dmbs = ism_max_dmbs, + .register_dmb = ism_register_dmb, + .unregister_dmb = ism_unregister_dmb, + .move_data = ism_move, .add_vlan_id = ism_add_vlan_id, .del_vlan_id = ism_del_vlan_id, }; @@ -568,15 +571,10 @@ static int ism_dev_init(struct ism_dev *ism) if (ret <= 0) goto out; - ism->sba_client_arr = kzalloc(ISM_NR_DMBS, GFP_KERNEL); - if (!ism->sba_client_arr) - goto free_vectors; - memset(ism->sba_client_arr, NO_CLIENT, ISM_NR_DMBS); - ret = request_irq(pci_irq_vector(pdev, 0), ism_handle_irq, 0, pci_name(pdev), ism); if (ret) - goto free_client_arr; + goto free_vectors; ret = register_sba(ism); if (ret) @@ -605,8 +603,6 @@ unreg_sba: unregister_sba(ism); free_irq: free_irq(pci_irq_vector(pdev, 0), ism); -free_client_arr: - kfree(ism->sba_client_arr); free_vectors: pci_free_irq_vectors(pdev); out: @@ -629,7 +625,6 @@ static void ism_dev_exit(struct ism_dev *ism) unregister_ieq(ism); unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); - kfree(ism->sba_client_arr); pci_free_irq_vectors(pdev); list_del_init(&ism->list); mutex_unlock(&ism_dev_list.mutex); @@ -677,6 +672,9 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) dibs->drv_priv = ism; dibs->ops = &ism_ops; + /* enable ism device, but any interrupts and events will be ignored + * before dibs_dev_add() adds it to any clients. + */ ret = ism_dev_init(ism); if (ret) goto err_dibs; @@ -766,17 +764,6 @@ module_exit(ism_exit); /*************************** SMC-D Implementation *****************************/ #if IS_ENABLED(CONFIG_SMC) -static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, - void *client) -{ - return ism_register_dmb(smcd->priv, (struct ism_dmb *)dmb, client); -} - -static int smcd_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - return ism_unregister_dmb(smcd->priv, (struct ism_dmb *)dmb); -} - static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, u32 event_code, u64 info) { @@ -801,18 +788,8 @@ static int smcd_signal_ieq(struct smcd_dev *smcd, struct smcd_gid *rgid, trigger_irq, event_code, info); } -static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, - bool sf, unsigned int offset, void *data, - unsigned int size) -{ - return ism_move(smcd->priv, dmb_tok, idx, sf, offset, data, size); -} - static const struct smcd_ops ism_smcd_ops = { - .register_dmb = smcd_register_dmb, - .unregister_dmb = smcd_unregister_dmb, .signal_event = smcd_signal_ieq, - .move_data = smcd_move, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index c75a40fe3039..be009c614205 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -36,12 +36,44 @@ * clients. */ +/* DMB - Direct Memory Buffer + * -------------------------- + * A dibs client provides a dmb as input buffer for a local receiving + * dibs device for exactly one (remote) sending dibs device. Only this + * sending device can send data into this dmb using move_data(). Sender + * and receiver can be the same device. A dmb belongs to exactly one client. + */ +struct dibs_dmb { + /* tok - Token for this dmb + * Used by remote and local devices and clients to address this dmb. + * Provided by dibs fabric. Unique per dibs fabric. + */ + u64 dmb_tok; + /* rgid - GID of designated remote sending device */ + uuid_t rgid; + /* cpu_addr - buffer address */ + void *cpu_addr; + /* len - buffer length */ + u32 dmb_len; + /* idx - Index of this DMB on this receiving device */ + u32 idx; + /* VLAN support (deprecated) + * In order to write into a vlan-tagged dmb, the remote device needs + * to belong to the this vlan + */ + u32 vlan_valid; + u32 vlan_id; + /* optional, used by device driver */ + dma_addr_t dma_addr; +}; + struct dibs_dev; /* DIBS client * ----------- */ #define MAX_DIBS_CLIENTS 8 +#define NO_DIBS_CLIENT 0xff /* All dibs clients have access to all dibs devices. * A dibs client provides the following functions to be called by dibs layer or * dibs device drivers: @@ -69,6 +101,22 @@ struct dibs_client_ops { * The device is no longer usable by this client after this call. */ void (*del_dev)(struct dibs_dev *dev); + /** + * handle_irq() - Handle signaling for a DMB + * @dev: device that owns the dmb + * @idx: Index of the dmb that got signalled + * @dmbemask: signaling mask of the dmb + * + * Handle signaling for a dmb that was registered by this client + * for this device. + * The dibs device can coalesce multiple signaling triggers into a + * single call of handle_irq(). dmbemask can be used to indicate + * different kinds of triggers. + * + * Context: Called in IRQ context by dibs device driver + */ + void (*handle_irq)(struct dibs_dev *dev, unsigned int idx, + u16 dmbemask); }; struct dibs_client { @@ -147,6 +195,77 @@ struct dibs_dev_ops { */ int (*query_remote_gid)(struct dibs_dev *dev, const uuid_t *rgid, u32 vid_valid, u32 vid); + /** + * max_dmbs() + * Return: Max number of DMBs that can be registered for this kind of + * dibs_dev + */ + int (*max_dmbs)(void); + /** + * register_dmb() - allocate and register a dmb + * @dev: dibs device + * @dmb: dmb struct to be registered + * @client: dibs client + * @vid: VLAN id; deprecated, ignored if device does not support vlan + * + * The following fields of dmb must provide valid input: + * @rgid: gid of remote user device + * @dmb_len: buffer length + * @idx: Optionally:requested idx (if non-zero) + * @vlan_valid: if zero, vlan_id will be ignored; + * deprecated, ignored if device does not support vlan + * @vlan_id: deprecated, ignored if device does not support vlan + * Upon return in addition the following fields will be valid: + * @dmb_tok: for usage by remote and local devices and clients + * @cpu_addr: allocated buffer + * @idx: dmb index, unique per dibs device + * @dma_addr: to be used by device driver,if applicable + * + * Allocate a dmb buffer and register it with this device and for this + * client. + * Return: zero on success + */ + int (*register_dmb)(struct dibs_dev *dev, struct dibs_dmb *dmb, + struct dibs_client *client); + /** + * unregister_dmb() - unregister and free a dmb + * @dev: dibs device + * @dmb: dmb struct to be unregistered + * The following fields of dmb must provide valid input: + * @dmb_tok + * @cpu_addr + * @idx + * + * Free dmb.cpu_addr and unregister the dmb from this device. + * Return: zero on success + */ + int (*unregister_dmb)(struct dibs_dev *dev, struct dibs_dmb *dmb); + /** + * move_data() - write into a remote dmb + * @dev: Local sending dibs device + * @dmb_tok: Token of the remote dmb + * @idx: signaling index in dmbemask + * @sf: signaling flag; + * if true, idx will be turned on at target dmbemask mask + * and target device will be signaled. + * @offset: offset within target dmb + * @data: pointer to data to be sent + * @size: length of data to be sent, can be zero. + * + * Use dev to write data of size at offset into a remote dmb + * identified by dmb_tok. Data is moved synchronously, *data can + * be freed when this function returns. + * + * If signaling flag (sf) is true, bit number idx bit will be turned + * on in the dmbemask mask when handle_irq() is called at the remote + * dibs client that owns the target dmb. The target device may chose + * to coalesce the signaling triggers of multiple move_data() calls + * to the same target dmb into a single handle_irq() call. + * Return: zero on success + */ + int (*move_data)(struct dibs_dev *dev, u64 dmb_tok, unsigned int idx, + bool sf, unsigned int offset, void *data, + unsigned int size); /** * add_vlan_id() - add dibs device to vlan (optional, deprecated) * @dev: dibs device @@ -166,6 +285,55 @@ struct dibs_dev_ops { * Return: zero on success */ int (*del_vlan_id)(struct dibs_dev *dev, u64 vlan_id); + /** + * support_mmapped_rdmb() - can this device provide memory mapped + * remote dmbs? (optional) + * @dev: dibs device + * + * A dibs device can provide a kernel address + length, that represent + * a remote target dmb (like MMIO). Alternatively to calling + * move_data(), a dibs client can write into such a ghost-send-buffer + * (= to this kernel address) and the data will automatically + * immediately appear in the target dmb, even without calling + * move_data(). + * + * Either all 3 function pointers for support_dmb_nocopy(), + * attach_dmb() and detach_dmb() are defined, or all of them must + * be NULL. + * + * Return: non-zero, if memory mapped remote dmbs are supported. + */ + int (*support_mmapped_rdmb)(struct dibs_dev *dev); + /** + * attach_dmb() - attach local memory to a remote dmb + * @dev: Local sending ism device + * @dmb: all other parameters are passed in the form of a + * dmb struct + * TODO: (THIS IS CONFUSING, should be changed) + * dmb_tok: (in) Token of the remote dmb, we want to attach to + * cpu_addr: (out) MMIO address + * dma_addr: (out) MMIO address (if applicable, invalid otherwise) + * dmb_len: (out) length of local MMIO region, + * equal to length of remote DMB. + * sba_idx: (out) index of remote dmb (NOT HELPFUL, should be removed) + * + * Provides a memory address to the sender that can be used to + * directly write into the remote dmb. + * Memory is available until detach_dmb is called + * + * Return: Zero upon success, Error code otherwise + */ + int (*attach_dmb)(struct dibs_dev *dev, struct dibs_dmb *dmb); + /** + * detach_dmb() - Detach the ghost buffer from a remote dmb + * @dev: ism device + * @token: dmb token of the remote dmb + * + * No need to free cpu_addr. + * + * Return: Zero upon success, Error code otherwise + */ + int (*detach_dmb)(struct dibs_dev *dev, u64 token); }; struct dibs_dev { @@ -179,6 +347,15 @@ struct dibs_dev { /* priv pointer per client; for client usage only */ void *priv[MAX_DIBS_CLIENTS]; + + /* get this lock before accessing any of the fields below */ + spinlock_t lock; + /* array of client ids indexed by dmb idx; + * can be used as indices into priv and subs arrays + */ + u8 *dmb_clientid_arr; + /* Sparse array of all ISM clients */ + struct dibs_client *subs[MAX_DIBS_CLIENTS]; }; static inline void dibs_set_priv(struct dibs_dev *dev, diff --git a/include/linux/ism.h b/include/linux/ism.h index a926dd61b5a1..b7feb4dcd5a8 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -11,17 +11,6 @@ #include -struct ism_dmb { - u64 dmb_tok; - u64 rgid; - u32 dmb_len; - u32 sba_idx; - u32 vlan_valid; - u32 vlan_id; - void *cpu_addr; - dma_addr_t dma_addr; -}; - /* Unless we gain unexpected popularity, this limit should hold for a while */ #define MAX_CLIENTS 8 #define ISM_NR_DMBS 1920 @@ -36,7 +25,6 @@ struct ism_dev { struct ism_sba *sba; dma_addr_t sba_dma_addr; DECLARE_BITMAP(sba_bitmap, ISM_NR_DMBS); - u8 *sba_client_arr; /* entries are indices into 'clients' array */ void *priv[MAX_CLIENTS]; struct ism_eq *ieq; @@ -58,11 +46,6 @@ struct ism_event { struct ism_client { const char *name; void (*handle_event)(struct ism_dev *dev, struct ism_event *event); - /* Parameter dmbemask contains a bit vector with updated DMBEs, if sent - * via ism_move_data(). Callback function must handle all active bits - * indicated by dmbemask. - */ - void (*handle_irq)(struct ism_dev *dev, unsigned int bit, u16 dmbemask); /* Private area - don't touch! */ u8 id; }; @@ -79,12 +62,6 @@ static inline void ism_set_priv(struct ism_dev *dev, struct ism_client *client, dev->priv[client->id] = priv; } -int ism_register_dmb(struct ism_dev *dev, struct ism_dmb *dmb, - struct ism_client *client); -int ism_unregister_dmb(struct ism_dev *dev, struct ism_dmb *dmb); -int ism_move(struct ism_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, - unsigned int offset, void *data, unsigned int size); - const struct smcd_ops *ism_get_smcd_ops(void); #endif /* _ISM_H */ diff --git a/include/net/smc.h b/include/net/smc.h index 5bd135fb4d49..8e3debcf7db5 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -28,17 +28,6 @@ struct smc_hashinfo { }; /* SMCD/ISM device driver interface */ -struct smcd_dmb { - u64 dmb_tok; - u64 rgid; - u32 dmb_len; - u32 sba_idx; - u32 vlan_valid; - u32 vlan_id; - void *cpu_addr; - dma_addr_t dma_addr; -}; - #define ISM_EVENT_DMB 0 #define ISM_EVENT_GID 1 #define ISM_EVENT_SWR 2 @@ -53,25 +42,14 @@ struct smcd_gid { }; struct smcd_ops { - int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, - void *client); - int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); - int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, - bool sf, unsigned int offset, void *data, - unsigned int size); - /* optional operations */ int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, u32 trigger_irq, u32 event_code, u64 info); - int (*support_dmb_nocopy)(struct smcd_dev *dev); - int (*attach_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); - int (*detach_dmb)(struct smcd_dev *dev, u64 token); }; struct smcd_dev { const struct smcd_ops *ops; void *priv; - void *client; struct dibs_dev *dibs; struct list_head list; spinlock_t lock; diff --git a/net/smc/Makefile b/net/smc/Makefile index 96ccfdf246df..0e754cbc38f9 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -6,4 +6,3 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o -smc-y += smc_loopback.o diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index d20d00b46825..01e49371d23d 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -15,7 +15,6 @@ #include "smc.h" #include "smc_core.h" #include "smc_ism.h" -#include "smc_loopback.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" @@ -33,18 +32,19 @@ static void smcd_register_dev(struct dibs_dev *dibs); static void smcd_unregister_dev(struct dibs_dev *dibs); #if IS_ENABLED(CONFIG_ISM) static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); -static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, - u16 dmbemask); static struct ism_client smc_ism_client = { .name = "SMC-D", .handle_event = smcd_handle_event, - .handle_irq = smcd_handle_irq, }; #endif +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, + u16 dmbemask); + static struct dibs_client_ops smc_client_ops = { .add_dev = smcd_register_dev, .del_dev = smcd_unregister_dev, + .handle_irq = smcd_handle_irq, }; static struct dibs_client smc_dibs_client = { @@ -221,18 +221,19 @@ out: void smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dmb dmb; if (!dmb_desc->dma_addr) return; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - smcd->ops->unregister_dmb(smcd, &dmb); + + smcd->dibs->ops->unregister_dmb(smcd->dibs, &dmb); return; } @@ -240,17 +241,20 @@ void smc_ism_unregister_dmb(struct smcd_dev *smcd, int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dev *dibs; + struct dibs_dmb dmb; int rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_len = dmb_len; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; - dmb.rgid = lgr->peer_gid.gid; - rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client); + copy_to_dibsgid(&dmb.rgid, &lgr->peer_gid); + + dibs = lgr->smcd->dibs; + rc = dibs->ops->register_dmb(dibs, &dmb, &smc_dibs_client); if (!rc) { - dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->sba_idx = dmb.idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; @@ -265,24 +269,24 @@ bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd) * merging sndbuf with peer DMB to avoid * data copies between them. */ - return (smcd->ops->support_dmb_nocopy && - smcd->ops->support_dmb_nocopy(smcd)); + return (smcd->dibs->ops->support_mmapped_rdmb && + smcd->dibs->ops->support_mmapped_rdmb(smcd->dibs)); } int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dmb dmb; int rc = 0; - if (!dev->ops->attach_dmb) + if (!dev->dibs->ops->attach_dmb) return -EINVAL; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = token; - rc = dev->ops->attach_dmb(dev, &dmb); + rc = dev->dibs->ops->attach_dmb(dev->dibs, &dmb); if (!rc) { - dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->sba_idx = dmb.idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; @@ -294,10 +298,10 @@ int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token) { - if (!dev->ops->detach_dmb) + if (!dev->dibs->ops->detach_dmb) return -EINVAL; - return dev->ops->detach_dmb(dev, token); + return dev->dibs->ops->detach_dmb(dev->dibs, token); } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, @@ -503,26 +507,20 @@ static void smcd_register_dev(struct dibs_dev *dibs) { struct smcd_dev *smcd, *fentry; const struct smcd_ops *ops; - struct smc_lo_dev *smc_lo; struct ism_dev *ism; + int max_dmbs; - if (smc_ism_is_loopback(dibs)) { - if (smc_loopback_init(&smc_lo)) - return; - } + max_dmbs = dibs->ops->max_dmbs(); if (smc_ism_is_loopback(dibs)) { - ops = smc_lo_get_smcd_ops(); - smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, - SMC_LO_MAX_DMBS); + ops = NULL; } else { ism = dibs->drv_priv; #if IS_ENABLED(CONFIG_ISM) ops = ism_get_smcd_ops(); #endif - smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, - ISM_NR_DMBS); } + smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, max_dmbs); if (!smcd) return; @@ -530,13 +528,11 @@ static void smcd_register_dev(struct dibs_dev *dibs) dibs_set_priv(dibs, &smc_dibs_client, smcd); if (smc_ism_is_loopback(dibs)) { - smcd->priv = smc_lo; - smc_lo->smcd = smcd; + smcd->priv = NULL; } else { smcd->priv = ism; #if IS_ENABLED(CONFIG_ISM) ism_set_priv(ism, &smc_ism_client, smcd); - smcd->client = &smc_ism_client; #endif } @@ -590,8 +586,6 @@ static void smcd_unregister_dev(struct dibs_dev *dibs) list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); - if (smc_ism_is_loopback(dibs)) - smc_loopback_exit(); kfree(smcd->conn); kfree(smcd); } @@ -624,6 +618,7 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } +#endif /* SMCD Device interrupt handler. Called from ISM device interrupt handler. * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. @@ -632,10 +627,10 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) * Context: * - Function called in IRQ context from ISM device driver IRQ handler. */ -static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, u16 dmbemask) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_connection *conn = NULL; unsigned long flags; @@ -645,7 +640,6 @@ static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } -#endif int smc_ism_signal_shutdown(struct smc_link_group *lgr) { diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 139e99da2c9f..a1575e31df73 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -69,7 +69,9 @@ static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, { int rc; - rc = smcd->ops->move_data(smcd, dmb_tok, idx, sf, offset, data, len); + rc = smcd->dibs->ops->move_data(smcd->dibs, dmb_tok, idx, sf, offset, + data, len); + return rc < 0 ? rc : 0; } diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c deleted file mode 100644 index 52cba01cb209..000000000000 --- a/net/smc/smc_loopback.c +++ /dev/null @@ -1,294 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Shared Memory Communications Direct over loopback-ism device. - * - * Functions for loopback-ism device. - * - * Copyright (c) 2024, Alibaba Inc. - * - * Author: Wen Gu - * Tony Lu - * - */ - -#include -#include -#include -#include - -#include "smc_cdc.h" -#include "smc_ism.h" -#include "smc_loopback.h" - -#define SMC_LO_SUPPORT_NOCOPY 0x1 -#define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) - -static struct smc_lo_dev *lo_dev; - -static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, - void *client_priv) -{ - struct smc_lo_dmb_node *dmb_node, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - int sba_idx, rc; - - /* check space for new dmb */ - for_each_clear_bit(sba_idx, ldev->sba_idx_mask, SMC_LO_MAX_DMBS) { - if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask)) - break; - } - if (sba_idx == SMC_LO_MAX_DMBS) - return -ENOSPC; - - dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL); - if (!dmb_node) { - rc = -ENOMEM; - goto err_bit; - } - - dmb_node->sba_idx = sba_idx; - dmb_node->len = dmb->dmb_len; - dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | - __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC); - if (!dmb_node->cpu_addr) { - rc = -ENOMEM; - goto err_node; - } - dmb_node->dma_addr = SMC_DMA_ADDR_INVALID; - refcount_set(&dmb_node->refcnt, 1); - -again: - /* add new dmb into hash table */ - get_random_bytes(&dmb_node->token, sizeof(dmb_node->token)); - write_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) { - if (tmp_node->token == dmb_node->token) { - write_unlock_bh(&ldev->dmb_ht_lock); - goto again; - } - } - hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token); - write_unlock_bh(&ldev->dmb_ht_lock); - atomic_inc(&ldev->dmb_cnt); - - dmb->sba_idx = dmb_node->sba_idx; - dmb->dmb_tok = dmb_node->token; - dmb->cpu_addr = dmb_node->cpu_addr; - dmb->dma_addr = dmb_node->dma_addr; - dmb->dmb_len = dmb_node->len; - - return 0; - -err_node: - kfree(dmb_node); -err_bit: - clear_bit(sba_idx, ldev->sba_idx_mask); - return rc; -} - -static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev, - struct smc_lo_dmb_node *dmb_node) -{ - /* remove dmb from hash table */ - write_lock_bh(&ldev->dmb_ht_lock); - hash_del(&dmb_node->list); - write_unlock_bh(&ldev->dmb_ht_lock); - - clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); - kvfree(dmb_node->cpu_addr); - kfree(dmb_node); - - if (atomic_dec_and_test(&ldev->dmb_cnt)) - wake_up(&ldev->ldev_release); -} - -static int smc_lo_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb from hash table */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { - if (tmp_node->token == dmb->dmb_tok) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (refcount_dec_and_test(&dmb_node->refcnt)) - __smc_lo_unregister_dmb(ldev, dmb_node); - return 0; -} - -static int smc_lo_support_dmb_nocopy(struct smcd_dev *smcd) -{ - return SMC_LO_SUPPORT_NOCOPY; -} - -static int smc_lo_attach_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb_node according to dmb->dmb_tok */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { - if (tmp_node->token == dmb->dmb_tok) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (!refcount_inc_not_zero(&dmb_node->refcnt)) - /* the dmb is being unregistered, but has - * not been removed from the hash table. - */ - return -EINVAL; - - /* provide dmb information */ - dmb->sba_idx = dmb_node->sba_idx; - dmb->dmb_tok = dmb_node->token; - dmb->cpu_addr = dmb_node->cpu_addr; - dmb->dma_addr = dmb_node->dma_addr; - dmb->dmb_len = dmb_node->len; - return 0; -} - -static int smc_lo_detach_dmb(struct smcd_dev *smcd, u64 token) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb_node according to dmb->dmb_tok */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) { - if (tmp_node->token == token) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (refcount_dec_and_test(&dmb_node->refcnt)) - __smc_lo_unregister_dmb(ldev, dmb_node); - return 0; -} - -static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, - unsigned int idx, bool sf, unsigned int offset, - void *data, unsigned int size) -{ - struct smc_lo_dmb_node *rmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - struct smc_connection *conn; - - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { - if (tmp_node->token == dmb_tok) { - rmb_node = tmp_node; - break; - } - } - if (!rmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - memcpy((char *)rmb_node->cpu_addr + offset, data, size); - read_unlock_bh(&ldev->dmb_ht_lock); - - if (!sf) - return 0; - - conn = smcd->conn[rmb_node->sba_idx]; - if (!conn || conn->killed) - return -EPIPE; - tasklet_schedule(&conn->rx_tsklet); - return 0; -} - -static const struct smcd_ops lo_ops = { - .register_dmb = smc_lo_register_dmb, - .unregister_dmb = smc_lo_unregister_dmb, - .support_dmb_nocopy = smc_lo_support_dmb_nocopy, - .attach_dmb = smc_lo_attach_dmb, - .detach_dmb = smc_lo_detach_dmb, - .signal_event = NULL, - .move_data = smc_lo_move_data, -}; - -const struct smcd_ops *smc_lo_get_smcd_ops(void) -{ - return &lo_ops; -} - -static void smc_lo_dev_init(struct smc_lo_dev *ldev) -{ - rwlock_init(&ldev->dmb_ht_lock); - hash_init(ldev->dmb_ht); - atomic_set(&ldev->dmb_cnt, 0); - init_waitqueue_head(&ldev->ldev_release); - - return; -} - -static void smc_lo_dev_exit(struct smc_lo_dev *ldev) -{ - if (atomic_read(&ldev->dmb_cnt)) - wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); -} - -static int smc_lo_dev_probe(void) -{ - struct smc_lo_dev *ldev; - - ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); - if (!ldev) - return -ENOMEM; - - smc_lo_dev_init(ldev); - - lo_dev = ldev; /* global loopback device */ - - return 0; -} - -static void smc_lo_dev_remove(void) -{ - if (!lo_dev) - return; - - smc_lo_dev_exit(lo_dev); - kfree(lo_dev); - lo_dev = NULL; -} - -int smc_loopback_init(struct smc_lo_dev **smc_lb) -{ - int ret; - - ret = smc_lo_dev_probe(); - if (!ret) - *smc_lb = lo_dev; - return ret; -} - -void smc_loopback_exit(void) -{ - smc_lo_dev_remove(); -} diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h deleted file mode 100644 index 33bb96ec8b77..000000000000 --- a/net/smc/smc_loopback.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Shared Memory Communications Direct over loopback-ism device. - * - * SMC-D loopback-ism device structure definitions. - * - * Copyright (c) 2024, Alibaba Inc. - * - * Author: Wen Gu - * Tony Lu - * - */ - -#ifndef _SMC_LOOPBACK_H -#define _SMC_LOOPBACK_H - -#include -#include - -#define SMC_LO_MAX_DMBS 5000 -#define SMC_LO_DMBS_HASH_BITS 12 - -struct smc_lo_dmb_node { - struct hlist_node list; - u64 token; - u32 len; - u32 sba_idx; - void *cpu_addr; - dma_addr_t dma_addr; - refcount_t refcnt; -}; - -struct smc_lo_dev { - struct smcd_dev *smcd; - atomic_t dmb_cnt; - rwlock_t dmb_ht_lock; - DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS); - DECLARE_HASHTABLE(dmb_ht, SMC_LO_DMBS_HASH_BITS); - wait_queue_head_t ldev_release; -}; - -const struct smcd_ops *smc_lo_get_smcd_ops(void); - -int smc_loopback_init(struct smc_lo_dev **smc_lb); -void smc_loopback_exit(void); - -#endif /* _SMC_LOOPBACK_H */ -- cgit v1.2.3 From a612dbe8d04d47af91fa88f0599c1370cc70f687 Mon Sep 17 00:00:00 2001 From: Julian Ruess Date: Thu, 18 Sep 2025 13:05:00 +0200 Subject: dibs: Move event handling to dibs layer Add defines for all event types and subtypes an ism device is known to produce as it can be helpful for debugging purposes. Introduces a generic 'struct dibs_event' and adopt ism device driver and smc-d client accordingly. Tolerate and ignore other type and subtype values to enable future device extensions. SMC-D and ISM are now independent. struct ism_dev can be moved to drivers/s390/net/ism.h. Note that in smc, the term 'ism' is still used. Future patches could replace that with 'dibs' or 'smc-d' as appropriate. Signed-off-by: Julian Ruess Co-developed-by: Alexandra Winter Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-15-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- MAINTAINERS | 2 - arch/s390/configs/debug_defconfig | 1 + arch/s390/configs/defconfig | 1 + drivers/dibs/dibs_main.c | 2 +- drivers/s390/net/Kconfig | 1 - drivers/s390/net/ism.h | 42 +++++++- drivers/s390/net/ism_drv.c | 221 ++++++++++++-------------------------- include/linux/dibs.h | 62 +++++++++++ include/net/smc.h | 15 --- net/smc/Kconfig | 1 - net/smc/smc_ism.c | 99 ++++++----------- 11 files changed, 208 insertions(+), 239 deletions(-) (limited to 'include/net') diff --git a/MAINTAINERS b/MAINTAINERS index ecc55fae5f9d..1fbe541198f7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17580,7 +17580,6 @@ F: include/linux/fddidevice.h F: include/linux/hippidevice.h F: include/linux/if_* F: include/linux/inetdevice.h -F: include/linux/ism.h F: include/linux/netdev* F: include/linux/platform_data/wiznet.h F: include/uapi/linux/cn_proc.h @@ -22237,7 +22236,6 @@ L: linux-s390@vger.kernel.org L: netdev@vger.kernel.org S: Supported F: drivers/s390/net/ -F: include/linux/ism.h S390 PCI SUBSYSTEM M: Niklas Schnelle diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index a97c8d19f643..fdde8ee0d7bd 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -122,6 +122,7 @@ CONFIG_XFRM_USER=m CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y +CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y CONFIG_IP_MULTICAST=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 7f7b52d9a33c..bf9e7dbd4a89 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -113,6 +113,7 @@ CONFIG_XFRM_USER=m CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y +CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y CONFIG_IP_MULTICAST=y diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index aacb3ea7825a..5425238d5a42 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -98,7 +98,7 @@ int dibs_unregister_client(struct dibs_client *client) goto err_reg_dmb; } } - /* Stop forwarding IRQs */ + /* Stop forwarding IRQs and events */ dibs->subs[client->id] = NULL; spin_unlock_irqrestore(&dibs->lock, flags); clients[client->id]->ops->del_dev(dibs); diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig index 92985f595d59..0fd700c5745a 100644 --- a/drivers/s390/net/Kconfig +++ b/drivers/s390/net/Kconfig @@ -82,7 +82,6 @@ config CCWGROUP config ISM tristate "Support for ISM vPCI Adapter" depends on PCI && DIBS - imply SMC default n help Select this option if you want to use the Internal Shared Memory diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 1b9fa14da20c..08d17956cb36 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -6,13 +6,13 @@ #include #include #include -#include -#include #include #define UTIL_STR_LEN 16 #define ISM_ERROR 0xFFFF +#define ISM_NR_DMBS 1920 + /* * Do not use the first word of the DMB bits to ensure 8 byte aligned access. */ @@ -34,6 +34,23 @@ #define ISM_UNREG_SBA 0x11 #define ISM_UNREG_IEQ 0x12 +enum ism_event_type { + ISM_EVENT_BUF = 0x00, + ISM_EVENT_DEV = 0x01, + ISM_EVENT_SWR = 0x02 +}; + +enum ism_event_code { + ISM_BUF_DMB_UNREGISTERED = 0x04, + ISM_BUF_USING_ISM_DEV_DISABLED = 0x08, + ISM_BUF_OWNING_ISM_DEV_IN_ERR_STATE = 0x02, + ISM_BUF_USING_ISM_DEV_IN_ERR_STATE = 0x03, + ISM_BUF_VLAN_MISMATCH_WITH_OWNER = 0x05, + ISM_BUF_VLAN_MISMATCH_WITH_USER = 0x06, + ISM_DEV_GID_DISABLED = 0x07, + ISM_DEV_GID_ERR_STATE = 0x01 +}; + struct ism_req_hdr { u32 cmd; u16 : 16; @@ -185,6 +202,14 @@ struct ism_eq_header { u64 : 64; }; +struct ism_event { + u32 type; + u32 code; + u64 tok; + u64 time; + u64 info; +}; + struct ism_eq { struct ism_eq_header header; struct ism_event entry[15]; @@ -199,6 +224,19 @@ struct ism_sba { u16 dmbe_mask[ISM_NR_DMBS]; }; +struct ism_dev { + spinlock_t cmd_lock; /* serializes cmds */ + struct dibs_dev *dibs; + struct pci_dev *pdev; + struct ism_sba *sba; + dma_addr_t sba_dma_addr; + DECLARE_BITMAP(sba_bitmap, ISM_NR_DMBS); + + struct ism_eq *ieq; + dma_addr_t ieq_dma_addr; + int ieq_idx; +}; + #define ISM_CREATE_REQ(dmb, idx, sf, offset) \ ((dmb) | (idx) << 24 | (sf) << 23 | (offset)) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 346d1ea8650b..f84aa2e676e9 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -31,86 +31,6 @@ MODULE_DEVICE_TABLE(pci, ism_device_table); static debug_info_t *ism_debug_info; -#define NO_CLIENT 0xff /* must be >= MAX_CLIENTS */ -static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */ - /* a list for fast mapping */ -static u8 max_client; -static DEFINE_MUTEX(clients_lock); -struct ism_dev_list { - struct list_head list; - struct mutex mutex; /* protects ism device list */ -}; - -static struct ism_dev_list ism_dev_list = { - .list = LIST_HEAD_INIT(ism_dev_list.list), - .mutex = __MUTEX_INITIALIZER(ism_dev_list.mutex), -}; - -static void ism_setup_forwarding(struct ism_client *client, struct ism_dev *ism) -{ - unsigned long flags; - - spin_lock_irqsave(&ism->lock, flags); - ism->subs[client->id] = client; - spin_unlock_irqrestore(&ism->lock, flags); -} - -int ism_register_client(struct ism_client *client) -{ - struct ism_dev *ism; - int i, rc = -ENOSPC; - - mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < MAX_CLIENTS; ++i) { - if (!clients[i]) { - clients[i] = client; - client->id = i; - if (i == max_client) - max_client++; - rc = 0; - break; - } - } - mutex_unlock(&clients_lock); - - if (i < MAX_CLIENTS) { - /* initialize with all devices that we got so far */ - list_for_each_entry(ism, &ism_dev_list.list, list) { - ism->priv[i] = NULL; - ism_setup_forwarding(client, ism); - } - } - mutex_unlock(&ism_dev_list.mutex); - - return rc; -} -EXPORT_SYMBOL_GPL(ism_register_client); - -int ism_unregister_client(struct ism_client *client) -{ - struct ism_dev *ism; - unsigned long flags; - int rc = 0; - - mutex_lock(&ism_dev_list.mutex); - list_for_each_entry(ism, &ism_dev_list.list, list) { - spin_lock_irqsave(&ism->lock, flags); - /* Stop forwarding IRQs and events */ - ism->subs[client->id] = NULL; - spin_unlock_irqrestore(&ism->lock, flags); - } - mutex_unlock(&ism_dev_list.mutex); - - mutex_lock(&clients_lock); - clients[client->id] = NULL; - if (client->id + 1 == max_client) - max_client--; - mutex_unlock(&clients_lock); - return rc; -} -EXPORT_SYMBOL_GPL(ism_unregister_client); - static int ism_cmd(struct ism_dev *ism, void *cmd) { struct ism_req_hdr *req = cmd; @@ -445,6 +365,24 @@ static int ism_del_vlan_id(struct dibs_dev *dibs, u64 vlan_id) return ism_cmd(ism, &cmd); } +static int ism_signal_ieq(struct dibs_dev *dibs, const uuid_t *rgid, + u32 trigger_irq, u32 event_code, u64 info) +{ + struct ism_dev *ism = dibs->drv_priv; + union ism_sig_ieq cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; + cmd.request.hdr.len = sizeof(cmd.request); + + memcpy(&cmd.request.rgid, rgid, sizeof(cmd.request.rgid)); + cmd.request.trigger_irq = trigger_irq; + cmd.request.event_code = event_code; + cmd.request.info = info; + + return ism_cmd(ism, &cmd); +} + static unsigned int max_bytes(unsigned int start, unsigned int len, unsigned int boundary) { @@ -487,22 +425,68 @@ static u16 ism_get_chid(struct dibs_dev *dibs) return to_zpci(ism->pdev)->pchid; } +static int ism_match_event_type(u32 s390_event_type) +{ + switch (s390_event_type) { + case ISM_EVENT_BUF: + return DIBS_BUF_EVENT; + case ISM_EVENT_DEV: + return DIBS_DEV_EVENT; + case ISM_EVENT_SWR: + return DIBS_SW_EVENT; + default: + return DIBS_OTHER_TYPE; + } +} + +static int ism_match_event_subtype(u32 s390_event_subtype) +{ + switch (s390_event_subtype) { + case ISM_BUF_DMB_UNREGISTERED: + return DIBS_BUF_UNREGISTERED; + case ISM_DEV_GID_DISABLED: + return DIBS_DEV_DISABLED; + case ISM_DEV_GID_ERR_STATE: + return DIBS_DEV_ERR_STATE; + default: + return DIBS_OTHER_SUBTYPE; + } +} + static void ism_handle_event(struct ism_dev *ism) { + struct dibs_dev *dibs = ism->dibs; + struct dibs_event event; struct ism_event *entry; - struct ism_client *clt; + struct dibs_client *clt; int i; while ((ism->ieq_idx + 1) != READ_ONCE(ism->ieq->header.idx)) { - if (++(ism->ieq_idx) == ARRAY_SIZE(ism->ieq->entry)) + if (++ism->ieq_idx == ARRAY_SIZE(ism->ieq->entry)) ism->ieq_idx = 0; entry = &ism->ieq->entry[ism->ieq_idx]; debug_event(ism_debug_info, 2, entry, sizeof(*entry)); - for (i = 0; i < max_client; ++i) { - clt = ism->subs[i]; + __memset(&event, 0, sizeof(event)); + event.type = ism_match_event_type(entry->type); + if (event.type == DIBS_SW_EVENT) + event.subtype = entry->code; + else + event.subtype = ism_match_event_subtype(entry->code); + event.time = entry->time; + event.data = entry->info; + switch (event.type) { + case DIBS_BUF_EVENT: + event.buffer_tok = entry->tok; + break; + case DIBS_DEV_EVENT: + case DIBS_SW_EVENT: + memcpy(&event.gid, &entry->tok, sizeof(u64)); + } + for (i = 0; i < MAX_DIBS_CLIENTS; ++i) { + clt = dibs->subs[i]; if (clt) - clt->handle_event(ism, entry); + clt->ops->handle_event(dibs, &event); } } } @@ -560,12 +544,13 @@ static const struct dibs_dev_ops ism_ops = { .move_data = ism_move, .add_vlan_id = ism_add_vlan_id, .del_vlan_id = ism_del_vlan_id, + .signal_event = ism_signal_ieq, }; static int ism_dev_init(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; - int i, ret; + int ret; ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI); if (ret <= 0) @@ -584,18 +569,6 @@ static int ism_dev_init(struct ism_dev *ism) if (ret) goto unreg_sba; - mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < max_client; ++i) { - if (clients[i]) { - ism_setup_forwarding(clients[i], ism); - } - } - mutex_unlock(&clients_lock); - - list_add(&ism->list, &ism_dev_list.list); - mutex_unlock(&ism_dev_list.mutex); - query_info(ism); return 0; @@ -612,22 +585,11 @@ out: static void ism_dev_exit(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; - unsigned long flags; - int i; - - spin_lock_irqsave(&ism->lock, flags); - for (i = 0; i < max_client; ++i) - ism->subs[i] = NULL; - spin_unlock_irqrestore(&ism->lock, flags); - - mutex_lock(&ism_dev_list.mutex); unregister_ieq(ism); unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); pci_free_irq_vectors(pdev); - list_del_init(&ism->list); - mutex_unlock(&ism_dev_list.mutex); } static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) @@ -641,7 +603,6 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!ism) return -ENOMEM; - spin_lock_init(&ism->lock); spin_lock_init(&ism->cmd_lock); dev_set_drvdata(&pdev->dev, ism); ism->pdev = pdev; @@ -742,8 +703,6 @@ static int __init ism_init(void) if (!ism_debug_info) return -ENODEV; - memset(clients, 0, sizeof(clients)); - max_client = 0; debug_register_view(ism_debug_info, &debug_hex_ascii_view); ret = pci_register_driver(&ism_driver); if (ret) @@ -760,41 +719,3 @@ static void __exit ism_exit(void) module_init(ism_init); module_exit(ism_exit); - -/*************************** SMC-D Implementation *****************************/ - -#if IS_ENABLED(CONFIG_SMC) -static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, - u32 event_code, u64 info) -{ - union ism_sig_ieq cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; - cmd.request.hdr.len = sizeof(cmd.request); - - cmd.request.rgid = rgid; - cmd.request.trigger_irq = trigger_irq; - cmd.request.event_code = event_code; - cmd.request.info = info; - - return ism_cmd(ism, &cmd); -} - -static int smcd_signal_ieq(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 trigger_irq, u32 event_code, u64 info) -{ - return ism_signal_ieq(smcd->priv, rgid->gid, - trigger_irq, event_code, info); -} - -static const struct smcd_ops ism_smcd_ops = { - .signal_event = smcd_signal_ieq, -}; - -const struct smcd_ops *ism_get_smcd_ops(void) -{ - return &ism_smcd_ops; -} -EXPORT_SYMBOL_GPL(ism_get_smcd_ops); -#endif diff --git a/include/linux/dibs.h b/include/linux/dibs.h index be009c614205..c75607f8a5cf 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -67,6 +67,41 @@ struct dibs_dmb { dma_addr_t dma_addr; }; +/* DIBS events + * ----------- + * Dibs devices can optionally notify dibs clients about events that happened + * in the fabric or at the remote device or remote dmb. + */ +enum dibs_event_type { + /* Buffer event, e.g. a remote dmb was unregistered */ + DIBS_BUF_EVENT, + /* Device event, e.g. a remote dibs device was disabled */ + DIBS_DEV_EVENT, + /* Software event, a dibs client can send an event signal to a + * remote dibs device. + */ + DIBS_SW_EVENT, + DIBS_OTHER_TYPE }; + +enum dibs_event_subtype { + DIBS_BUF_UNREGISTERED, + DIBS_DEV_DISABLED, + DIBS_DEV_ERR_STATE, + DIBS_OTHER_SUBTYPE +}; + +struct dibs_event { + u32 type; + u32 subtype; + /* uuid_null if invalid */ + uuid_t gid; + /* zero if invalid */ + u64 buffer_tok; + u64 time; + /* additional data or zero */ + u64 data; +}; + struct dibs_dev; /* DIBS client @@ -117,6 +152,15 @@ struct dibs_client_ops { */ void (*handle_irq)(struct dibs_dev *dev, unsigned int idx, u16 dmbemask); + /** + * handle_event() - Handle control information sent by device + * @dev: device reporting the event + * @event: ism event structure + * + * * Context: Called in IRQ context by dibs device driver + */ + void (*handle_event)(struct dibs_dev *dev, + const struct dibs_event *event); }; struct dibs_client { @@ -285,6 +329,24 @@ struct dibs_dev_ops { * Return: zero on success */ int (*del_vlan_id)(struct dibs_dev *dev, u64 vlan_id); + /** + * signal_event() - trigger an event at a remote dibs device (optional) + * @dev: local dibs device + * @rgid: gid of remote dibs device + * trigger_irq: zero: notification may be coalesced with other events + * non-zero: notify immediately + * @subtype: 4 byte event code, meaning is defined by dibs client + * @data: 8 bytes of additional information, + * meaning is defined by dibs client + * + * dibs devices can offer support for sending a control event of type + * EVENT_SWR to a remote dibs device. + * NOTE: handle_event() will be called for all registered dibs clients + * at the remote device. + * Return: zero on success + */ + int (*signal_event)(struct dibs_dev *dev, const uuid_t *rgid, + u32 trigger_irq, u32 event_code, u64 info); /** * support_mmapped_rdmb() - can this device provide memory mapped * remote dmbs? (optional) diff --git a/include/net/smc.h b/include/net/smc.h index 8e3debcf7db5..08bee529ed8d 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -16,7 +16,6 @@ #include #include #include -#include "linux/ism.h" struct sock; @@ -28,28 +27,14 @@ struct smc_hashinfo { }; /* SMCD/ISM device driver interface */ -#define ISM_EVENT_DMB 0 -#define ISM_EVENT_GID 1 -#define ISM_EVENT_SWR 2 - #define ISM_RESERVED_VLANID 0x1FFF -struct smcd_dev; - struct smcd_gid { u64 gid; u64 gid_ext; }; -struct smcd_ops { - /* optional operations */ - int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, - u32 trigger_irq, u32 event_code, u64 info); -}; - struct smcd_dev { - const struct smcd_ops *ops; - void *priv; struct dibs_dev *dibs; struct list_head list; spinlock_t lock; diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 9535d88c2acb..99ecd59d1f4b 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -2,7 +2,6 @@ config SMC tristate "SMC socket protocol family" depends on INET && INFINIBAND && DIBS - depends on m || ISM != m help SMC-R provides a "sockets over RDMA" solution making use of RDMA over Converged Ethernet (RoCE) technology to upgrade diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 01e49371d23d..7b228ca2f96a 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -17,7 +17,6 @@ #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" -#include "linux/ism.h" #include "linux/dibs.h" struct smcd_dev_list smcd_dev_list = { @@ -30,20 +29,15 @@ static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; static void smcd_register_dev(struct dibs_dev *dibs); static void smcd_unregister_dev(struct dibs_dev *dibs); -#if IS_ENABLED(CONFIG_ISM) -static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); - -static struct ism_client smc_ism_client = { - .name = "SMC-D", - .handle_event = smcd_handle_event, -}; -#endif +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event); static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, u16 dmbemask); static struct dibs_client_ops smc_client_ops = { .add_dev = smcd_register_dev, .del_dev = smcd_unregister_dev, + .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; @@ -399,11 +393,10 @@ int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -#if IS_ENABLED(CONFIG_ISM) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; - struct ism_event event; + struct dibs_event event; }; #define ISM_EVENT_REQUEST 0x0001 @@ -423,25 +416,27 @@ union smcd_sw_event_info { static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { - struct smcd_gid peer_gid = { .gid = wrk->event.tok, - .gid_ext = 0 }; + struct dibs_dev *dibs = wrk->smcd->dibs; union smcd_sw_event_info ev_info; + struct smcd_gid peer_gid; + uuid_t ism_rgid; - ev_info.info = wrk->event.info; - switch (wrk->event.code) { + copy_to_smcdgid(&peer_gid, &wrk->event.gid); + ev_info.info = wrk->event.data; + switch (wrk->event.subtype) { case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ if (ev_info.code == ISM_EVENT_REQUEST && - wrk->smcd->ops->signal_event) { + dibs->ops->signal_event) { ev_info.code = ISM_EVENT_RESPONSE; - wrk->smcd->ops->signal_event(wrk->smcd, - &peer_gid, - ISM_EVENT_REQUEST_IR, - ISM_EVENT_CODE_TESTLINK, - ev_info.info); - } + copy_to_dibsgid(&ism_rgid, &peer_gid); + dibs->ops->signal_event(dibs, &ism_rgid, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_TESTLINK, + ev_info.info); + } break; } } @@ -451,26 +446,24 @@ static void smc_ism_event_work(struct work_struct *work) { struct smc_ism_event_work *wrk = container_of(work, struct smc_ism_event_work, work); - struct smcd_gid smcd_gid = { .gid = wrk->event.tok, - .gid_ext = 0 }; + struct smcd_gid smcd_gid; + + copy_to_smcdgid(&smcd_gid, &wrk->event.gid); switch (wrk->event.type) { - case ISM_EVENT_GID: /* GID event, token is peer GID */ + case DIBS_DEV_EVENT: /* GID event, token is peer GID */ smc_smcd_terminate(wrk->smcd, &smcd_gid, VLAN_VID_MASK); break; - case ISM_EVENT_DMB: + case DIBS_BUF_EVENT: break; - case ISM_EVENT_SWR: /* Software defined event */ + case DIBS_SW_EVENT: /* Software defined event */ smcd_handle_sw_event(wrk); break; } kfree(wrk); } -#endif -static struct smcd_dev *smcd_alloc_dev(const char *name, - const struct smcd_ops *ops, - int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(const char *name, int max_dmbs) { struct smcd_dev *smcd; @@ -487,8 +480,6 @@ static struct smcd_dev *smcd_alloc_dev(const char *name, if (!smcd->event_wq) goto free_conn; - smcd->ops = ops; - spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); @@ -506,36 +497,17 @@ free_smcd: static void smcd_register_dev(struct dibs_dev *dibs) { struct smcd_dev *smcd, *fentry; - const struct smcd_ops *ops; - struct ism_dev *ism; int max_dmbs; max_dmbs = dibs->ops->max_dmbs(); - if (smc_ism_is_loopback(dibs)) { - ops = NULL; - } else { - ism = dibs->drv_priv; -#if IS_ENABLED(CONFIG_ISM) - ops = ism_get_smcd_ops(); -#endif - } - smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, max_dmbs); + smcd = smcd_alloc_dev(dev_name(&dibs->dev), max_dmbs); if (!smcd) return; smcd->dibs = dibs; dibs_set_priv(dibs, &smc_dibs_client, smcd); - if (smc_ism_is_loopback(dibs)) { - smcd->priv = NULL; - } else { - smcd->priv = ism; -#if IS_ENABLED(CONFIG_ISM) - ism_set_priv(ism, &smc_ism_client, smcd); -#endif - } - if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); @@ -590,7 +562,6 @@ static void smcd_unregister_dev(struct dibs_dev *dibs) kfree(smcd); } -#if IS_ENABLED(CONFIG_ISM) /* SMCD Device event handler. Called from ISM device interrupt handler. * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), @@ -602,9 +573,10 @@ static void smcd_unregister_dev(struct dibs_dev *dibs) * Context: * - Function called in IRQ context from ISM device driver event handler. */ -static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_ism_event_work *wrk; if (smcd->going_away) @@ -618,7 +590,6 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } -#endif /* SMCD Device interrupt handler. Called from ISM device interrupt handler. * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. @@ -644,22 +615,22 @@ static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, int smc_ism_signal_shutdown(struct smc_link_group *lgr) { int rc = 0; -#if IS_ENABLED(CONFIG_ISM) union smcd_sw_event_info ev_info; + uuid_t ism_rgid; if (lgr->peer_shutdown) return 0; - if (!lgr->smcd->ops->signal_event) + if (!lgr->smcd->dibs->ops->signal_event) return 0; memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; - rc = lgr->smcd->ops->signal_event(lgr->smcd, &lgr->peer_gid, + copy_to_dibsgid(&ism_rgid, &lgr->peer_gid); + rc = lgr->smcd->dibs->ops->signal_event(lgr->smcd->dibs, &ism_rgid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_SHUTDOWN, ev_info.info); -#endif return rc; } @@ -670,9 +641,6 @@ int smc_ism_init(void) smc_ism_v2_capable = false; smc_ism_create_system_eid(); -#if IS_ENABLED(CONFIG_ISM) - rc = ism_register_client(&smc_ism_client); -#endif rc = dibs_register_client(&smc_dibs_client); return rc; } @@ -680,7 +648,4 @@ int smc_ism_init(void) void smc_ism_exit(void) { dibs_unregister_client(&smc_dibs_client); -#if IS_ENABLED(CONFIG_ISM) - ism_unregister_client(&smc_ism_client); -#endif } -- cgit v1.2.3 From 8f12d1137c2382c80aada8e05d7cc650cd4e403c Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:49 -0700 Subject: bpf: Clear pfmemalloc flag when freeing all fragments It is possible for bpf_xdp_adjust_tail() to free all fragments. The kfunc currently clears the XDP_FLAGS_HAS_FRAGS bit, but not XDP_FLAGS_FRAGS_PF_MEMALLOC. So far, this has not caused a issue when building sk_buff from xdp_buff since all readers of xdp_buff->flags use the flag only when there are fragments. Clear the XDP_FLAGS_FRAGS_PF_MEMALLOC bit as well to make the flags correct. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Reviewed-by: Maciej Fijalkowski Link: https://patch.msgid.link/20250922233356.3356453-2-ameryhung@gmail.com --- include/net/xdp.h | 5 +++++ net/core/filter.c | 1 + 2 files changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index b40f1f96cb11..f288c348a6c1 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -115,6 +115,11 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } +static __always_inline void xdp_buff_clear_frag_pfmemalloc(struct xdp_buff *xdp) +{ + xdp->flags &= ~XDP_FLAGS_FRAGS_PF_MEMALLOC; +} + static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { diff --git a/net/core/filter.c b/net/core/filter.c index 63f3baee2daf..5837534f4352 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4210,6 +4210,7 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) if (unlikely(!sinfo->nr_frags)) { xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); xdp->data_end -= offset; } -- cgit v1.2.3 From dea1526fbafb55099a788cde0b659530ee5b1c66 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:50 -0700 Subject: bpf: Allow bpf_xdp_shrink_data to shrink a frag from head and tail Move skb_frag_t adjustment into bpf_xdp_shrink_data() and extend its functionality to be able to shrink an xdp fragment from both head and tail. In a later patch, bpf_xdp_pull_data() will reuse it to shrink an xdp fragment from head. Additionally, in bpf_xdp_frags_shrink_tail(), breaking the loop when bpf_xdp_shrink_data() returns false (i.e., not releasing the current fragment) is not necessary as the loop condition, offset > 0, has the same effect. Remove the else branch to simplify the code. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Reviewed-by: Maciej Fijalkowski Link: https://patch.msgid.link/20250922233356.3356453-3-ameryhung@gmail.com --- include/net/xdp_sock_drv.h | 21 ++++++++++++++++++--- net/core/filter.c | 41 ++++++++++++++++++++++++----------------- 2 files changed, 42 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 513c8e9704f6..4f2d3268a676 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -160,13 +160,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) return ret; } -static inline void xsk_buff_del_tail(struct xdp_buff *tail) +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) { - struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); list_del(&xskb->list_node); } +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *frag; + + frag = list_first_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, + list_node); + return &frag->xdp; +} + static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); @@ -389,8 +399,13 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) return NULL; } -static inline void xsk_buff_del_tail(struct xdp_buff *tail) +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) +{ +} + +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) { + return NULL; } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) diff --git a/net/core/filter.c b/net/core/filter.c index 5837534f4352..8cae575ad437 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4153,34 +4153,45 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) return 0; } -static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, - enum xdp_mem_type mem_type, bool release) +static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, + bool tail, bool release) { - struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); + struct xdp_buff *zc_frag = tail ? xsk_buff_get_tail(xdp) : + xsk_buff_get_head(xdp); if (release) { - xsk_buff_del_tail(zc_frag); - __xdp_return(0, mem_type, false, zc_frag); + xsk_buff_del_frag(zc_frag); } else { - zc_frag->data_end -= shrink; + if (tail) + zc_frag->data_end -= shrink; + else + zc_frag->data += shrink; } + + return zc_frag; } static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, - int shrink) + int shrink, bool tail) { enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; + netmem_ref netmem = skb_frag_netmem(frag); + struct xdp_buff *zc_frag = NULL; if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { - bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); - goto out; + netmem = 0; + zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release); } - if (release) - __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL); + if (release) { + __xdp_return(netmem, mem_type, false, zc_frag); + } else { + if (!tail) + skb_frag_off_add(frag, shrink); + skb_frag_size_sub(frag, shrink); + } -out: return release; } @@ -4198,12 +4209,8 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) len_free += shrink; offset -= shrink; - if (bpf_xdp_shrink_data(xdp, frag, shrink)) { + if (bpf_xdp_shrink_data(xdp, frag, shrink, true)) n_frags_free++; - } else { - skb_frag_size_sub(frag, shrink); - break; - } } sinfo->nr_frags -= n_frags_free; sinfo->xdp_frags_size -= len_free; -- cgit v1.2.3 From b650bf0977d34c52befb31a9fa711534e11b220f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Sep 2025 10:42:40 +0000 Subject: udp: remove busylock and add per NUMA queues busylock was protecting UDP sockets against packet floods, but unfortunately was not protecting the host itself. Under stress, many cpus could spin while acquiring the busylock, and NIC had to drop packets. Or packets would be dropped in cpu backlog if RPS/RFS were in place. This patch replaces the busylock by intermediate lockless queues. (One queue per NUMA node). This means that fewer number of cpus have to acquire the UDP receive queue lock. Most of the cpus can either: - immediately drop the packet. - or queue it in their NUMA aware lockless queue. Then one of the cpu is chosen to process this lockless queue in a batch. The batch only contains packets that were cooked on the same NUMA node, thus with very limited latency impact. Tested: DDOS targeting a victim UDP socket, on a platform with 6 NUMA nodes (Intel(R) Xeon(R) 6985P-C) Before: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 1004179 0.0 Udp6InErrors 3117 0.0 Udp6RcvbufErrors 3117 0.0 After: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 1116633 0.0 Udp6InErrors 14197275 0.0 Udp6RcvbufErrors 14197275 0.0 We can see this host can now proces 14.2 M more packets per second while under attack, and the victim socket can receive 11 % more packets. I used a small bpftrace program measuring time (in us) spent in __udp_enqueue_schedule_skb(). Before: @udp_enqueue_us[398]: [0] 24901 |@@@ | [1] 63512 |@@@@@@@@@ | [2, 4) 344827 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [4, 8) 244673 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [8, 16) 54022 |@@@@@@@@ | [16, 32) 222134 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [32, 64) 232042 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [64, 128) 4219 | | [128, 256) 188 | | After: @udp_enqueue_us[398]: [0] 5608855 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [1] 1111277 |@@@@@@@@@@ | [2, 4) 501439 |@@@@ | [4, 8) 102921 | | [8, 16) 29895 | | [16, 32) 43500 | | [32, 64) 31552 | | [64, 128) 979 | | [128, 256) 13 | | Note that the remaining bottleneck for this platform is in udp_drops_inc() because we limited struct numa_drop_counters to only two nodes so far. Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Reviewed-by: Willem de Bruijn Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250922104240.2182559-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 9 +++- include/net/udp.h | 11 ++++- net/ipv4/udp.c | 117 +++++++++++++++++++++++++++++++--------------------- net/ipv6/udp.c | 5 ++- 4 files changed, 91 insertions(+), 51 deletions(-) (limited to 'include/net') diff --git a/include/linux/udp.h b/include/linux/udp.h index e554890c4415..58795688a186 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -44,6 +44,12 @@ enum { UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */ }; +/* per NUMA structure for lockless producer usage. */ +struct udp_prod_queue { + struct llist_head ll_root ____cacheline_aligned_in_smp; + atomic_t rmem_alloc; +}; + struct udp_sock { /* inet_sock has to be the first member */ struct inet_sock inet; @@ -90,6 +96,8 @@ struct udp_sock { struct sk_buff *skb, int nhoff); + struct udp_prod_queue *udp_prod_queue; + /* udp_recvmsg try to use this before splicing sk_receive_queue */ struct sk_buff_head reader_queue ____cacheline_aligned_in_smp; @@ -109,7 +117,6 @@ struct udp_sock { */ struct hlist_node tunnel_list; struct numa_drop_counters drop_counters; - spinlock_t busylock ____cacheline_aligned_in_smp; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/udp.h b/include/net/udp.h index 059a0cee5f55..cffedb3e40f2 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -284,16 +284,23 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); -static inline void udp_lib_init_sock(struct sock *sk) +static inline int udp_lib_init_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); sk->sk_drop_counters = &up->drop_counters; - spin_lock_init(&up->busylock); skb_queue_head_init(&up->reader_queue); INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); + + up->udp_prod_queue = kcalloc(nr_node_ids, sizeof(*up->udp_prod_queue), + GFP_KERNEL); + if (!up->udp_prod_queue) + return -ENOMEM; + for (int i = 0; i < nr_node_ids; i++) + init_llist_head(&up->udp_prod_queue[i].ll_root); + return 0; } static inline void udp_drops_inc(struct sock *sk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 85cfc32eb2cc..95241093b7f0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1685,25 +1685,6 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) udp_rmem_release(sk, udp_skb_truesize(skb), 1, true); } -/* Idea of busylocks is to let producers grab an extra spinlock - * to relieve pressure on the receive_queue spinlock shared by consumer. - * Under flood, this means that only one producer can be in line - * trying to acquire the receive_queue spinlock. - */ -static spinlock_t *busylock_acquire(struct sock *sk) -{ - spinlock_t *busy = &udp_sk(sk)->busylock; - - spin_lock(busy); - return busy; -} - -static void busylock_release(spinlock_t *busy) -{ - if (busy) - spin_unlock(busy); -} - static int udp_rmem_schedule(struct sock *sk, int size) { int delta; @@ -1718,14 +1699,24 @@ static int udp_rmem_schedule(struct sock *sk, int size) int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; + struct udp_prod_queue *udp_prod_queue; + struct sk_buff *next, *to_drop = NULL; + struct llist_node *ll_list; unsigned int rmem, rcvbuf; - spinlock_t *busy = NULL; int size, err = -ENOMEM; + int total_size = 0; + int q_size = 0; + int dropcount; + int nb = 0; rmem = atomic_read(&sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); size = skb->truesize; + udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()]; + + rmem += atomic_read(&udp_prod_queue->rmem_alloc); + /* Immediately drop when the receive queue is full. * Cast to unsigned int performs the boundary check for INT_MAX. */ @@ -1747,45 +1738,77 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (rmem > (rcvbuf >> 1)) { skb_condense(skb); size = skb->truesize; - rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if (rmem > rcvbuf) - goto uncharge_drop; - busy = busylock_acquire(sk); - } else { - atomic_add(size, &sk->sk_rmem_alloc); } udp_set_dev_scratch(skb); + atomic_add(size, &udp_prod_queue->rmem_alloc); + + if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root)) + return 0; + + dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0; + spin_lock(&list->lock); - err = udp_rmem_schedule(sk, size); - if (err) { - spin_unlock(&list->lock); - goto uncharge_drop; - } - sk_forward_alloc_add(sk, -size); + ll_list = llist_del_all(&udp_prod_queue->ll_root); - /* no need to setup a destructor, we will explicitly release the - * forward allocated memory on dequeue - */ - sock_skb_set_dropcount(sk, skb); + ll_list = llist_reverse_order(ll_list); + + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { + size = udp_skb_truesize(skb); + total_size += size; + err = udp_rmem_schedule(sk, size); + if (unlikely(err)) { + /* Free the skbs outside of locked section. */ + skb->next = to_drop; + to_drop = skb; + continue; + } + + q_size += size; + sk_forward_alloc_add(sk, -size); + + /* no need to setup a destructor, we will explicitly release the + * forward allocated memory on dequeue + */ + SOCK_SKB_CB(skb)->dropcount = dropcount; + nb++; + __skb_queue_tail(list, skb); + } + + atomic_add(q_size, &sk->sk_rmem_alloc); - __skb_queue_tail(list, skb); spin_unlock(&list->lock); - if (!sock_flag(sk, SOCK_DEAD)) - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); + if (!sock_flag(sk, SOCK_DEAD)) { + /* Multiple threads might be blocked in recvmsg(), + * using prepare_to_wait_exclusive(). + */ + while (nb) { + INDIRECT_CALL_1(sk->sk_data_ready, + sock_def_readable, sk); + nb--; + } + } + + if (unlikely(to_drop)) { + for (nb = 0; to_drop != NULL; nb++) { + skb = to_drop; + to_drop = skb->next; + skb_mark_not_on_list(skb); + /* TODO: update SNMP values. */ + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM); + } + numa_drop_add(&udp_sk(sk)->drop_counters, nb); + } - busylock_release(busy); - return 0; + atomic_sub(total_size, &udp_prod_queue->rmem_alloc); -uncharge_drop: - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + return 0; drop: udp_drops_inc(sk); - busylock_release(busy); return err; } EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb); @@ -1803,6 +1826,7 @@ void udp_destruct_common(struct sock *sk) kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); + kfree(up->udp_prod_queue); } EXPORT_IPV6_MOD_GPL(udp_destruct_common); @@ -1814,10 +1838,11 @@ static void udp_destruct_sock(struct sock *sk) int udp_init_sock(struct sock *sk) { - udp_lib_init_sock(sk); + int res = udp_lib_init_sock(sk); + sk->sk_destruct = udp_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); - return 0; + return res; } void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9f4d340d1e3a..813a2ba75824 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -67,10 +67,11 @@ static void udpv6_destruct_sock(struct sock *sk) int udpv6_init_sock(struct sock *sk) { - udp_lib_init_sock(sk); + int res = udp_lib_init_sock(sk); + sk->sk_destruct = udpv6_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); - return 0; + return res; } INDIRECT_CALLABLE_SCOPE -- cgit v1.2.3 From 25c550464acd40803d63868dfa4a42506df48b88 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Sep 2025 10:59:04 +0200 Subject: net: gro: remove is_ipv6 from napi_gro_cb Remove is_ipv6 from napi_gro_cb and use sk->sk_family instead. This frees up space for another ip_fixedid bit that will be added in the next commit. udp_sock_create always creates either a AF_INET or a AF_INET6 socket, so using sk->sk_family is reliable. In IPv6-FOU, cfg->ipv6_v6only is always enabled. Signed-off-by: Richard Gobert Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250923085908.4687-2-richardbgobert@gmail.com Signed-off-by: Paolo Abeni --- include/net/gro.h | 3 --- net/ipv4/fou_core.c | 32 ++++++++++++++------------------ net/ipv4/udp_offload.c | 2 -- net/ipv6/udp_offload.c | 2 -- 4 files changed, 14 insertions(+), 25 deletions(-) (limited to 'include/net') diff --git a/include/net/gro.h b/include/net/gro.h index a0fca7ac6e7e..87c68007f949 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -71,9 +71,6 @@ struct napi_gro_cb { /* Free the skb? */ u8 free:2; - /* Used in foo-over-udp, set in udp[46]_gro_receive */ - u8 is_ipv6:1; - /* Used in GRE, set in fou/gue_gro_receive */ u8 is_fou:1; diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index 3e30745e2c09..3970b6b7ace5 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -228,21 +228,27 @@ drop: return 0; } +static const struct net_offload *fou_gro_ops(const struct sock *sk, + int proto) +{ + const struct net_offload __rcu **offloads; + + /* FOU doesn't allow IPv4 on IPv6 sockets. */ + offloads = sk->sk_family == AF_INET6 ? inet6_offloads : inet_offloads; + return rcu_dereference(offloads[proto]); +} + static struct sk_buff *fou_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { - const struct net_offload __rcu **offloads; struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; struct sk_buff *pp = NULL; - u8 proto; if (!fou) goto out; - proto = fou->protocol; - /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are simply @@ -254,8 +260,7 @@ static struct sk_buff *fou_gro_receive(struct sock *sk, /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, fou->protocol); if (!ops || !ops->callbacks.gro_receive) goto out; @@ -268,10 +273,8 @@ out: static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { - const struct net_offload __rcu **offloads; struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; - u8 proto; int err; if (!fou) { @@ -279,10 +282,7 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, goto out; } - proto = fou->protocol; - - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, fou->protocol); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) { err = -ENOSYS; goto out; @@ -323,7 +323,6 @@ static struct sk_buff *gue_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { - const struct net_offload __rcu **offloads; const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; @@ -450,8 +449,7 @@ next_proto: /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, proto); if (!ops || !ops->callbacks.gro_receive) goto out; @@ -467,7 +465,6 @@ out: static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); - const struct net_offload __rcu **offloads; const struct net_offload *ops; unsigned int guehlen = 0; u8 proto; @@ -494,8 +491,7 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) return err; } - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, proto); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b1f3fd302e9d..19d0b5b09ffa 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -891,8 +891,6 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) skb_gro_checksum_try_convert(skb, IPPROTO_UDP, inet_gro_compute_pseudo); skip: - NAPI_GRO_CB(skb)->is_ipv6 = 0; - if (static_branch_unlikely(&udp_encap_needed_key)) sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index d8445ac1b2e4..046f13b1d77a 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -154,8 +154,6 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) ip6_gro_compute_pseudo); skip: - NAPI_GRO_CB(skb)->is_ipv6 = 1; - if (static_branch_unlikely(&udpv6_encap_needed_key)) sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest); -- cgit v1.2.3 From 21f7484220ace6c355cb0023d14d83da6fe5843d Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Sep 2025 10:59:05 +0200 Subject: net: gro: only merge packets with incrementing or fixed outer ids Only merge encapsulated packets if their outer IDs are either incrementing or fixed, just like for inner IDs and IDs of non-encapsulated packets. Add another ip_fixedid bit for a total of two bits: one for outer IDs (and for unencapsulated packets) and one for inner IDs. This commit preserves the current behavior of GSO where only the IDs of the inner-most headers are restored correctly. Signed-off-by: Richard Gobert Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250923085908.4687-3-richardbgobert@gmail.com Signed-off-by: Paolo Abeni --- include/net/gro.h | 26 +++++++++++--------------- net/ipv4/tcp_offload.c | 5 ++++- 2 files changed, 15 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/gro.h b/include/net/gro.h index 87c68007f949..e7997a9fb30b 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -75,7 +75,7 @@ struct napi_gro_cb { u8 is_fou:1; /* Used to determine if ipid_offset can be ignored */ - u8 ip_fixedid:1; + u8 ip_fixedid:2; /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4; @@ -442,29 +442,26 @@ static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, } static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2, - struct sk_buff *p, bool outer) + struct sk_buff *p, bool inner) { const u32 id = ntohl(*(__be32 *)&iph->id); const u32 id2 = ntohl(*(__be32 *)&iph2->id); const u16 ipid_offset = (id >> 16) - (id2 >> 16); const u16 count = NAPI_GRO_CB(p)->count; const u32 df = id & IP_DF; - int flush; /* All fields must match except length and checksum. */ - flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF)); - - if (flush | (outer && df)) - return flush; + if ((iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF))) + return true; /* When we receive our second frame we can make a decision on if we * continue this flow as an atomic flow with a fixed ID or if we use * an incrementing ID. */ if (count == 1 && df && !ipid_offset) - NAPI_GRO_CB(p)->ip_fixedid = true; + NAPI_GRO_CB(p)->ip_fixedid |= 1 << inner; - return ipid_offset ^ (count * !NAPI_GRO_CB(p)->ip_fixedid); + return ipid_offset ^ (count * !(NAPI_GRO_CB(p)->ip_fixedid & (1 << inner))); } static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) @@ -479,7 +476,7 @@ static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr static inline int __gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p, const u16 diff, - bool outer) + bool inner) { const void *nh = th - diff; const void *nh2 = th2 - diff; @@ -487,19 +484,18 @@ static inline int __gro_receive_network_flush(const void *th, const void *th2, if (((struct iphdr *)nh)->version == 6) return ipv6_gro_flush(nh, nh2); else - return inet_gro_flush(nh, nh2, p, outer); + return inet_gro_flush(nh, nh2, p, inner); } static inline int gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p) { - const bool encap_mark = NAPI_GRO_CB(p)->encap_mark; int off = skb_transport_offset(p); int flush; - flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark); - if (encap_mark) - flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false); + flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, false); + if (NAPI_GRO_CB(p)->encap_mark) + flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, true); return flush; } diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index e6612bd84d09..1949eede9ec9 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -471,6 +471,7 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb); + bool is_fixedid; if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4; @@ -484,8 +485,10 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, iph->daddr, 0); + is_fixedid = (NAPI_GRO_CB(skb)->ip_fixedid >> skb->encapsulation) & 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 | - (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID); + (is_fixedid * SKB_GSO_TCP_FIXEDID); tcp_gro_complete(skb); return 0; -- cgit v1.2.3 From f095a358faf263bf1d8ae712bd38e13b71286819 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Sep 2025 10:59:07 +0200 Subject: net: gro: remove unnecessary df checks Currently, packets with fixed IDs will be merged only if their don't-fragment bit is set. This restriction is unnecessary since packets without the don't-fragment bit will be forwarded as-is even if they were merged together. The merged packets will be segmented into their original forms before being forwarded, either by GSO or by TSO. The IDs will also remain identical unless NETIF_F_TSO_MANGLEID is set, in which case the IDs can become incrementing, which is also fine. Clean up the code by removing the unnecessary don't-fragment checks. Signed-off-by: Richard Gobert Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250923085908.4687-5-richardbgobert@gmail.com Signed-off-by: Paolo Abeni --- include/net/gro.h | 5 ++--- net/ipv4/af_inet.c | 3 --- tools/testing/selftests/net/gro.c | 9 ++++----- 3 files changed, 6 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/gro.h b/include/net/gro.h index e7997a9fb30b..e3affb2e2ca8 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -448,17 +448,16 @@ static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *ip const u32 id2 = ntohl(*(__be32 *)&iph2->id); const u16 ipid_offset = (id >> 16) - (id2 >> 16); const u16 count = NAPI_GRO_CB(p)->count; - const u32 df = id & IP_DF; /* All fields must match except length and checksum. */ - if ((iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF))) + if ((iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | ((id ^ id2) & IP_DF)) return true; /* When we receive our second frame we can make a decision on if we * continue this flow as an atomic flow with a fixed ID or if we use * an incrementing ID. */ - if (count == 1 && df && !ipid_offset) + if (count == 1 && !ipid_offset) NAPI_GRO_CB(p)->ip_fixedid |= 1 << inner; return ipid_offset ^ (count * !(NAPI_GRO_CB(p)->ip_fixedid & (1 << inner))); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 804c51296c55..3109c5ec38f3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1395,10 +1395,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); - /* fixed ID is invalid if DF bit is not set */ fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap)); - if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) - goto out; if (!skb->encapsulation || encap) udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c index d5824eadea10..3d4a82a2607c 100644 --- a/tools/testing/selftests/net/gro.c +++ b/tools/testing/selftests/net/gro.c @@ -670,7 +670,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(9); break; - case 3: /* DF=0, Fixed - should not coalesce */ + case 3: /* DF=0, Fixed - should coalesce */ iph1->frag_off &= ~htons(IP_DF); iph1->id = htons(8); @@ -1188,10 +1188,9 @@ static void gro_receiver(void) correct_payload[0] = PAYLOAD_LEN * 2; check_recv_pkts(rxfd, correct_payload, 1); - printf("DF=0, Fixed - should not coalesce: "); - correct_payload[0] = PAYLOAD_LEN; - correct_payload[1] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 2); + printf("DF=0, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); correct_payload[0] = PAYLOAD_LEN * 2; -- cgit v1.2.3 From d4e99db3d942c8099006f3b7536bc52f766b475a Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 10 Aug 2025 23:53:20 +0200 Subject: Bluetooth: Annotate struct hci_drv_rp_read_info with __counted_by_le() Add the __counted_by_le() compiler attribute to the flexible array member 'supported_commands' to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Signed-off-by: Thorsten Blum Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_drv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_drv.h b/include/net/bluetooth/hci_drv.h index 2f01c44f05ec..3fd6fdbdb02e 100644 --- a/include/net/bluetooth/hci_drv.h +++ b/include/net/bluetooth/hci_drv.h @@ -47,7 +47,7 @@ struct hci_drv_ev_cmd_complete { struct hci_drv_rp_read_info { __u8 driver_name[HCI_DRV_MAX_DRIVER_NAME_LENGTH]; __le16 num_supported_commands; - __le16 supported_commands[]; + __le16 supported_commands[] __counted_by_le(num_supported_commands); } __packed; /* Driver specific OGF (Opcode Group Field) -- cgit v1.2.3 From 339a87883a14d6a818ca436fed41aa5d10e0f4bd Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 13 Aug 2025 15:21:19 -0400 Subject: Bluetooth: ISO: Use sk_sndtimeo as conn_timeout This aligns the usage of socket sk_sndtimeo as conn_timeout when initiating a connection and then use it when scheduling the resulting HCI command, similar to what has been done in bf98feea5b65 ("Bluetooth: hci_conn: Always use sk_timeo as conn_timeout"). Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 10 ++++++---- net/bluetooth/hci_conn.c | 20 ++++++++++++-------- net/bluetooth/iso.c | 16 ++++++++++------ 3 files changed, 28 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6560b32f3125..a068beae9318 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1587,16 +1587,18 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, __u16 setting, struct bt_codec *codec, u16 timeout); struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos); + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout); struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base); + __u8 base_len, __u8 *base, u16 timeout); struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos); + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout); struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos, - __u8 data_len, __u8 *data); + __u8 data_len, __u8 *data, u16 timeout); struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos); int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 091cff2155e6..111f0e37b672 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1539,7 +1539,7 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos) /* This function requires the caller holds hdev->lock */ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; int err; @@ -1581,6 +1581,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, conn->state = BT_CONNECT; conn->sid = sid; + conn->conn_timeout = timeout; hci_conn_hold(conn); return conn; @@ -1921,7 +1922,8 @@ done: } struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos) + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout) { struct hci_conn *cis; @@ -1936,6 +1938,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, cis->dst_type = dst_type; cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET; cis->iso_qos.ucast.cis = BT_ISO_QOS_CIS_UNSET; + cis->conn_timeout = timeout; } if (cis->state == BT_CONNECTED) @@ -2175,7 +2178,7 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err) struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; struct hci_conn *parent; @@ -2196,7 +2199,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, base, base_len); /* We need hci_conn object using the BDADDR_ANY as dst */ - conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir); + conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir, timeout); if (IS_ERR(conn)) return conn; @@ -2249,13 +2252,13 @@ static void bis_mark_per_adv(struct hci_conn *conn, void *data) struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; int err; struct iso_list_data data; - conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base); + conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base, timeout); if (IS_ERR(conn)) return conn; @@ -2298,7 +2301,8 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, } struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos) + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout) { struct hci_conn *le; struct hci_conn *cis; @@ -2322,7 +2326,7 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, hci_iso_qos_setup(hdev, le, &qos->ucast.in, le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys); - cis = hci_bind_cis(hdev, dst, dst_type, qos); + cis = hci_bind_cis(hdev, dst, dst_type, qos, timeout); if (IS_ERR(cis)) { hci_conn_drop(le); return cis; diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index ac6e83313b9b..5c68c0ea7d97 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -91,8 +91,8 @@ static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, iso_sock_match_t match, void *data); /* ---- ISO timers ---- */ -#define ISO_CONN_TIMEOUT (HZ * 40) -#define ISO_DISCONN_TIMEOUT (HZ * 2) +#define ISO_CONN_TIMEOUT secs_to_jiffies(20) +#define ISO_DISCONN_TIMEOUT secs_to_jiffies(2) static void iso_conn_free(struct kref *ref) { @@ -367,7 +367,8 @@ static int iso_connect_bis(struct sock *sk) if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, iso_pi(sk)->base_len, - iso_pi(sk)->base); + iso_pi(sk)->base, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -376,7 +377,8 @@ static int iso_connect_bis(struct sock *sk) hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, - iso_pi(sk)->base_len, iso_pi(sk)->base); + iso_pi(sk)->base_len, iso_pi(sk)->base, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -469,7 +471,8 @@ static int iso_connect_cis(struct sock *sk) if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos); + &iso_pi(sk)->qos, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -477,7 +480,8 @@ static int iso_connect_cis(struct sock *sk) } else { hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos); + &iso_pi(sk)->qos, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; -- cgit v1.2.3 From c9beb36c14660713b948e289b1e352cc3d386d44 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 13 Aug 2025 15:57:39 -0400 Subject: Bluetooth: hci_core: Detect if an ISO link has stalled This attempts to detect if an ISO link has been waiting for an ISO buffer for longer than the maximum allowed transport latency then proceed to use hci_link_tx_to which prints an error and disconnects. Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_core.c | 34 ++++++++++++++++++++++++++-------- 3 files changed, 28 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index df1847b74e55..9ecc70baaca9 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -488,6 +488,7 @@ enum { #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ #define HCI_ACL_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ +#define HCI_ISO_TX_TIMEOUT usecs_to_jiffies(0x7fffff) /* 8388607 usecs */ /* HCI data types */ #define HCI_COMMAND_PKT 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a068beae9318..2924c2bf2a98 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -487,6 +487,7 @@ struct hci_dev { unsigned long acl_last_tx; unsigned long le_last_tx; + unsigned long iso_last_tx; __u8 le_tx_def_phys; __u8 le_rx_def_phys; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index e2bffad9816f..4cf4bb1187dc 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3585,24 +3585,37 @@ static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type) static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) { - unsigned long last_tx; + unsigned long timeout; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return; switch (type) { + case ACL_LINK: + /* tx timeout must be longer than maximum link supervision + * timeout (40.9 seconds) + */ + timeout = hdev->acl_last_tx + HCI_ACL_TX_TIMEOUT; + break; case LE_LINK: - last_tx = hdev->le_last_tx; + /* tx timeout must be longer than maximum link supervision + * timeout (40.9 seconds) + */ + timeout = hdev->le_last_tx + HCI_ACL_TX_TIMEOUT; break; - default: - last_tx = hdev->acl_last_tx; + case CIS_LINK: + case BIS_LINK: + case PA_LINK: + /* tx timeout must be longer than the maximum transport latency + * (8.388607 seconds) + */ + timeout = hdev->iso_last_tx + HCI_ISO_TX_TIMEOUT; break; + default: + return; } - /* tx timeout must be longer than maximum link supervision timeout - * (40.9 seconds) - */ - if (!cnt && time_after(jiffies, last_tx + HCI_ACL_TX_TIMEOUT)) + if (!cnt && time_after(jiffies, timeout)) hci_link_tx_to(hdev, type); } @@ -3759,10 +3772,15 @@ static void hci_sched_iso(struct hci_dev *hdev, __u8 type) return; cnt = &hdev->iso_cnt; + + __check_timeout(hdev, *cnt, type); + while (*cnt && (conn = hci_low_sent(hdev, type, "e))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); + hci_send_conn_frame(hdev, conn, skb); + hdev->iso_last_tx = jiffies; conn->sent++; if (conn->sent == ~0) -- cgit v1.2.3 From 9eb14331885b09adfa7fe69a5a4603e24909c88f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 18 Aug 2025 16:43:53 -0400 Subject: Bluetooth: Add function and line information to bt_dbg When enabling debug via CONFIG_BT_FEATURE_DEBUG include function and line information by default otherwise it is hard to make any sense of which function the logs comes from. Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index e5751f3070b8..d46ed9011ee5 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -272,7 +272,8 @@ void bt_err_ratelimited(const char *fmt, ...); #define BT_ERR(fmt, ...) bt_err(fmt "\n", ##__VA_ARGS__) #if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG) -#define BT_DBG(fmt, ...) bt_dbg(fmt "\n", ##__VA_ARGS__) +#define BT_DBG(fmt, ...) \ + bt_dbg("%s:%d: " fmt "\n", __func__, __LINE__, ##__VA_ARGS__) #else #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) #endif -- cgit v1.2.3 From be812ace0378a9db86344ad637c5ed2a5d11f216 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 9 Sep 2025 14:13:35 +0200 Subject: Bluetooth: Avoid a couple dozen -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Use the __struct_group() helper to fix 31 instances of the following type of warnings: 30 net/bluetooth/mgmt_config.c:16:33: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] 1 net/bluetooth/mgmt_config.c:22:33: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Simon Horman Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/mgmt.h | 9 +++++++-- net/bluetooth/mgmt_config.c | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 3575cd16049a..74edea06985b 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -53,10 +53,15 @@ struct mgmt_hdr { } __packed; struct mgmt_tlv { - __le16 type; - __u8 length; + /* New members MUST be added within the __struct_group() macro below. */ + __struct_group(mgmt_tlv_hdr, __hdr, __packed, + __le16 type; + __u8 length; + ); __u8 value[]; } __packed; +static_assert(offsetof(struct mgmt_tlv, value) == sizeof(struct mgmt_tlv_hdr), + "struct member likely outside of __struct_group()"); struct mgmt_addr_info { bdaddr_t bdaddr; diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c index 6ef701c27da4..c4063d200c0a 100644 --- a/net/bluetooth/mgmt_config.c +++ b/net/bluetooth/mgmt_config.c @@ -13,13 +13,13 @@ #define HDEV_PARAM_U16(_param_name_) \ struct {\ - struct mgmt_tlv entry; \ + struct mgmt_tlv_hdr entry; \ __le16 value; \ } __packed _param_name_ #define HDEV_PARAM_U8(_param_name_) \ struct {\ - struct mgmt_tlv entry; \ + struct mgmt_tlv_hdr entry; \ __u8 value; \ } __packed _param_name_ -- cgit v1.2.3 From a7556779745c047efb7b0ce8732889b0cdc80936 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Sep 2025 11:40:38 +0200 Subject: tcp: make tcp_rcvbuf_grow() accessible to mptcp code To leverage the auto-tuning improvements brought by commit 2da35e4b4df9 ("Merge branch 'tcp-receive-side-improvements'"), the MPTCP stack need to access the mentioned helper. Acked-by: Geliang Tang Acked-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-2-5da266aa9c1a@kernel.org Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7c51a0a5ace8..5ca230ed526a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -370,6 +370,7 @@ void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, int *karg); enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); +void tcp_rcvbuf_grow(struct sock *sk); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0a2511ce34db..b44fdc309633 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -891,7 +891,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, } } -static void tcp_rcvbuf_grow(struct sock *sk) +void tcp_rcvbuf_grow(struct sock *sk) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3 From 7d452516b67add4a53e63bfa496d8df930a66b9a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Sep 2025 18:21:12 +0000 Subject: Revert "net: group sk_backlog and sk_receive_queue" This reverts commit 4effb335b5dab08cb6e2c38d038910f8b527cfc9. This was a benefit for UDP flood case, which was later greatly improved with commits 6471658dc66c ("udp: use skb_attempt_defer_free()") and b650bf0977d3 ("udp: remove busylock and add per NUMA queues"). Apparently blamed commit added a regression for RAW sockets, possibly because they do not use the dual RX queue strategy that UDP has. sock_queue_rcv_skb_reason() and RAW recvmsg() compete for sk_receive_buf and sk_rmem_alloc changes, and them being in the same cache line reduce performance. Fixes: 4effb335b5da ("net: group sk_backlog and sk_receive_queue") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202509281326.f605b4eb-lkp@intel.com Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: David Ahern Cc: Kuniyuki Iwashima Link: https://patch.msgid.link/20250929182112.824154-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 8c5b64f41ab7..60bcb13f045c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -395,6 +395,7 @@ struct sock { atomic_t sk_drops; __s32 sk_peek_off; + struct sk_buff_head sk_error_queue; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with @@ -412,7 +413,6 @@ struct sock { } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc - struct sk_buff_head sk_error_queue; __cacheline_group_end(sock_write_rx); __cacheline_group_begin(sock_read_rx); -- cgit v1.2.3 From 5b66169f6be4847008c0aea50885ff0632151479 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 25 Sep 2025 02:33:03 +0000 Subject: bonding: fix xfrm offload feature setup on active-backup mode The active-backup bonding mode supports XFRM ESP offload. However, when a bond is added using command like `ip link add bond0 type bond mode 1 miimon 100`, the `ethtool -k` command shows that the XFRM ESP offload is disabled. This occurs because, in bond_newlink(), we change bond link first and register bond device later. So the XFRM feature update in bond_option_mode_set() is not called as the bond device is not yet registered, leading to the offload feature not being set successfully. To resolve this issue, we can modify the code order in bond_newlink() to ensure that the bond device is registered first before changing the bond link parameters. This change will allow the XFRM ESP offload feature to be correctly enabled. Fixes: 007ab5345545 ("bonding: fix feature flag setting at init time") Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250925023304.472186-1-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/bonding/bond_netlink.c | 16 +++++++++------- include/net/bonding.h | 1 + 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 57be04f6cb11..f4f0feddd9fa 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4411,7 +4411,7 @@ void bond_work_init_all(struct bonding *bond) INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler); } -static void bond_work_cancel_all(struct bonding *bond) +void bond_work_cancel_all(struct bonding *bond) { cancel_delayed_work_sync(&bond->mii_work); cancel_delayed_work_sync(&bond->arp_work); diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 57fff2421f1b..7a9d73ec8e91 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -579,20 +579,22 @@ static int bond_newlink(struct net_device *bond_dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct bonding *bond = netdev_priv(bond_dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; int err; - err = bond_changelink(bond_dev, tb, data, extack); - if (err < 0) + err = register_netdevice(bond_dev); + if (err) return err; - err = register_netdevice(bond_dev); - if (!err) { - struct bonding *bond = netdev_priv(bond_dev); + netif_carrier_off(bond_dev); + bond_work_init_all(bond); - netif_carrier_off(bond_dev); - bond_work_init_all(bond); + err = bond_changelink(bond_dev, tb, data, extack); + if (err) { + bond_work_cancel_all(bond); + unregister_netdevice(bond_dev); } return err; diff --git a/include/net/bonding.h b/include/net/bonding.h index e06f0d63b2c1..bd56ad976cfb 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -711,6 +711,7 @@ struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev, int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave); void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay); void bond_work_init_all(struct bonding *bond); +void bond_work_cancel_all(struct bonding *bond); #ifdef CONFIG_PROC_FS void bond_create_proc_entry(struct bonding *bond); -- cgit v1.2.3 From 5628f3fe3b16114e8424bbfcf0594caef8958a06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2025 08:49:34 +0000 Subject: net: add NUMA awareness to skb_attempt_defer_free() Instead of sharing sd->defer_list & sd->defer_count with many cpus, add one pair for each NUMA node. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250928084934.3266948-4-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 4 ---- include/net/hotdata.h | 7 +++++++ net/core/dev.c | 35 +++++++++++++++++++++++------------ net/core/dev.h | 2 +- net/core/skbuff.c | 11 ++++++----- 5 files changed, 37 insertions(+), 22 deletions(-) (limited to 'include/net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5c9aa16933d1..d1a687444b27 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3536,10 +3536,6 @@ struct softnet_data { struct numa_drop_counters drop_counters; - /* Another possibly contended cache line */ - struct llist_head defer_list ____cacheline_aligned_in_smp; - atomic_long_t defer_count; - int defer_ipi_scheduled ____cacheline_aligned_in_smp; call_single_data_t defer_csd; }; diff --git a/include/net/hotdata.h b/include/net/hotdata.h index fda94b2647ff..4acec191c54a 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -2,10 +2,16 @@ #ifndef _NET_HOTDATA_H #define _NET_HOTDATA_H +#include #include #include #include +struct skb_defer_node { + struct llist_head defer_list; + atomic_long_t defer_count; +} ____cacheline_aligned_in_smp; + /* Read mostly data used in network fast paths. */ struct net_hotdata { #if IS_ENABLED(CONFIG_INET) @@ -30,6 +36,7 @@ struct net_hotdata { struct rps_sock_flow_table __rcu *rps_sock_flow_table; u32 rps_cpu_mask; #endif + struct skb_defer_node __percpu *skb_defer_nodes; int gro_normal_batch; int netdev_budget; int netdev_budget_usecs; diff --git a/net/core/dev.c b/net/core/dev.c index fb67372774de..a64cef2c537e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5180,8 +5180,9 @@ static void napi_schedule_rps(struct softnet_data *sd) __napi_schedule_irqoff(&mysd->backlog); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +void kick_defer_list_purge(unsigned int cpu) { + struct softnet_data *sd = &per_cpu(softnet_data, cpu); unsigned long flags; if (use_backlog_threads()) { @@ -6715,18 +6716,24 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -static void skb_defer_free_flush(struct softnet_data *sd) +static void skb_defer_free_flush(void) { struct llist_node *free_list; struct sk_buff *skb, *next; + struct skb_defer_node *sdn; + int node; - if (llist_empty(&sd->defer_list)) - return; - atomic_long_set(&sd->defer_count, 0); - free_list = llist_del_all(&sd->defer_list); + for_each_node(node) { + sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node; + + if (llist_empty(&sdn->defer_list)) + continue; + atomic_long_set(&sdn->defer_count, 0); + free_list = llist_del_all(&sdn->defer_list); - llist_for_each_entry_safe(skb, next, free_list, ll_node) { - napi_consume_skb(skb, 1); + llist_for_each_entry_safe(skb, next, free_list, ll_node) { + napi_consume_skb(skb, 1); + } } } @@ -6854,7 +6861,7 @@ count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); - skb_defer_free_flush(this_cpu_ptr(&softnet_data)); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); @@ -7713,7 +7720,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) local_irq_disable(); net_rps_action_and_irq_enable(sd); } - skb_defer_free_flush(sd); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); @@ -7755,7 +7762,7 @@ start: for (;;) { struct napi_struct *n; - skb_defer_free_flush(sd); + skb_defer_free_flush(); if (list_empty(&list)) { if (list_empty(&repoll)) { @@ -12989,7 +12996,6 @@ static int __init net_dev_init(void) sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); - init_llist_head(&sd->defer_list); gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; @@ -12999,6 +13005,11 @@ static int __init net_dev_init(void) if (net_page_pool_create(i)) goto out; } + net_hotdata.skb_defer_nodes = + __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids, + __alignof__(struct skb_defer_node)); + if (!net_hotdata.skb_defer_nodes) + goto out; if (use_backlog_threads()) smpboot_register_percpu_thread(&backlog_threads); diff --git a/net/core/dev.h b/net/core/dev.h index d6b08d435479..900880e8b5b4 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -357,7 +357,7 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi) WARN_ON(READ_ONCE(napi->list_owner) != -1); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); +void kick_defer_list_purge(unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 17455fc1e692..bc12790017b0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7185,9 +7185,9 @@ static void kfree_skb_napi_cache(struct sk_buff *skb) */ void skb_attempt_defer_free(struct sk_buff *skb) { + struct skb_defer_node *sdn; unsigned long defer_count; int cpu = skb->alloc_cpu; - struct softnet_data *sd; unsigned int defer_max; bool kick; @@ -7201,14 +7201,15 @@ nodefer: kfree_skb_napi_cache(skb); DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); DEBUG_NET_WARN_ON_ONCE(skb->destructor); - sd = &per_cpu(softnet_data, cpu); + sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); + defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - defer_count = atomic_long_inc_return(&sd->defer_count); + defer_count = atomic_long_inc_return(&sdn->defer_count); if (defer_count >= defer_max) goto nodefer; - llist_add(&skb->ll_node, &sd->defer_list); + llist_add(&skb->ll_node, &sdn->defer_list); /* Send an IPI every time queue reaches half capacity. */ kick = (defer_count - 1) == (defer_max >> 1); @@ -7217,7 +7218,7 @@ nodefer: kfree_skb_napi_cache(skb); * if we are unlucky enough (this seems very unlikely). */ if (unlikely(kick)) - kick_defer_list_purge(sd, cpu); + kick_defer_list_purge(cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, -- cgit v1.2.3 From 7a0f94361ffd6e1d31c79023e8674b492bef05e3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 30 Sep 2025 19:24:26 -0700 Subject: net: psp: don't assume reply skbs will have a socket Rx path may be passing around unreferenced sockets, which means that skb_set_owner_edemux() may not set skb->sk and PSP will crash: KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] RIP: 0010:psp_reply_set_decrypted (./include/net/psp/functions.h:132 net/psp/psp_sock.c:287) tcp_v6_send_response.constprop.0 (net/ipv6/tcp_ipv6.c:979) tcp_v6_send_reset (net/ipv6/tcp_ipv6.c:1140 (discriminator 1)) tcp_v6_do_rcv (net/ipv6/tcp_ipv6.c:1683) tcp_v6_rcv (net/ipv6/tcp_ipv6.c:1912) Fixes: 659a2899a57d ("tcp: add datapath logic for PSP with inline key exchange") Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251001022426.2592750-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/psp/functions.h | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/psp/psp_sock.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index ef7743664da3..c5c23a54774e 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -34,7 +34,7 @@ unsigned int psp_key_size(u32 version); void psp_sk_assoc_free(struct sock *sk); void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk); void psp_twsk_assoc_free(struct inet_timewait_sock *tw); -void psp_reply_set_decrypted(struct sk_buff *skb); +void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb); static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) { @@ -160,7 +160,7 @@ static inline void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } static inline void -psp_reply_set_decrypted(struct sk_buff *skb) { } +psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb) { } static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5ca97ede979c..ff11d3a85a36 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1668,7 +1668,7 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, nskb->ip_summed = CHECKSUM_NONE; if (orig_sk) { skb_set_owner_edemux(nskb, (struct sock *)orig_sk); - psp_reply_set_decrypted(nskb); + psp_reply_set_decrypted(orig_sk, nskb); } if (transmit_time) nskb->tstamp_type = SKB_CLOCK_MONOTONIC; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9622c2776ade..59c4977a811a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -974,7 +974,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (sk) { /* unconstify the socket only to attach it to buff with care. */ skb_set_owner_edemux(buff, (struct sock *)sk); - psp_reply_set_decrypted(buff); + psp_reply_set_decrypted(sk, buff); if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c index 5324a7603bed..a931d825d1cc 100644 --- a/net/psp/psp_sock.c +++ b/net/psp/psp_sock.c @@ -279,12 +279,12 @@ void psp_twsk_assoc_free(struct inet_timewait_sock *tw) psp_assoc_put(pas); } -void psp_reply_set_decrypted(struct sk_buff *skb) +void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb) { struct psp_assoc *pas; rcu_read_lock(); - pas = psp_sk_get_assoc_rcu(skb->sk); + pas = psp_sk_get_assoc_rcu(sk); if (pas && pas->tx.spi) skb->decrypted = 1; rcu_read_unlock(); -- cgit v1.2.3 From 21f4d45eba0b2dcae5dbc9e5e0ad08735c993f16 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 9 Oct 2025 16:02:19 +0100 Subject: net/ip6_tunnel: Prevent perpetual tunnel growth Similarly to ipv4 tunnel, ipv6 version updates dev->needed_headroom, too. While ipv4 tunnel headroom adjustment growth was limited in commit 5ae1e9922bbd ("net: ip_tunnel: prevent perpetual headroom growth"), ipv6 tunnel yet increases the headroom without any ceiling. Reflect ipv4 tunnel headroom adjustment limit on ipv6 version. Credits to Francesco Ruggeri, who was originally debugging this issue and wrote local Arista-specific patch and a reproducer. Fixes: 8eb30be0352d ("ipv6: Create ip6_tnl_xmit") Cc: Florian Westphal Cc: Francesco Ruggeri Signed-off-by: Dmitry Safonov Link: https://patch.msgid.link/20251009-ip6_tunnel-headroom-v2-1-8e4dbd8f7e35@arista.com Signed-off-by: Jakub Kicinski --- include/net/ip_tunnels.h | 15 +++++++++++++++ net/ipv4/ip_tunnel.c | 14 -------------- net/ipv6/ip6_tunnel.c | 3 +-- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 4314a97702ea..ecae35512b9b 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -611,6 +611,21 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom, bool reply); +static inline void ip_tunnel_adj_headroom(struct net_device *dev, + unsigned int headroom) +{ + /* we must cap headroom to some upperlimit, else pskb_expand_head + * will overflow header offsets in skb_headers_offset_update(). + */ + const unsigned int max_allowed = 512; + + if (headroom > max_allowed) + headroom = max_allowed; + + if (headroom > READ_ONCE(dev->needed_headroom)) + WRITE_ONCE(dev->needed_headroom, headroom); +} + int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline int iptunnel_pull_offloads(struct sk_buff *skb) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index aaeb5d16f0c9..158a30ae7c5f 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -568,20 +568,6 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, return 0; } -static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom) -{ - /* we must cap headroom to some upperlimit, else pskb_expand_head - * will overflow header offsets in skb_headers_offset_update(). - */ - static const unsigned int max_allowed = 512; - - if (headroom > max_allowed) - headroom = max_allowed; - - if (headroom > READ_ONCE(dev->needed_headroom)) - WRITE_ONCE(dev->needed_headroom, headroom); -} - void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto, int tunnel_hlen) { diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3262e81223df..6405072050e0 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1257,8 +1257,7 @@ route_lookup: */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; - if (max_headroom > READ_ONCE(dev->needed_headroom)) - WRITE_ONCE(dev->needed_headroom, max_headroom); + ip_tunnel_adj_headroom(dev, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) -- cgit v1.2.3 From 0d92808024b4e9868cef68d16f121d509843e80e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 1 Oct 2025 10:55:58 -0400 Subject: Bluetooth: HCI: Fix tracking of advertisement set/instance 0x00 This fixes the state tracking of advertisement set/instance 0x00 which is considered a legacy instance and is not tracked individually by adv_instances list, previously it was assumed that hci_dev itself would track it via HCI_LE_ADV but that is a global state not specifc to instance 0x00, so to fix it a new flag is introduced that only tracks the state of instance 0x00. Fixes: 1488af7b8b5f ("Bluetooth: hci_sync: Fix hci_resume_advertising_sync") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_event.c | 4 ++++ net/bluetooth/hci_sync.c | 5 ++--- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 9ecc70baaca9..8d0e703bc929 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -434,6 +434,7 @@ enum { HCI_USER_CHANNEL, HCI_EXT_CONFIGURED, HCI_LE_ADV, + HCI_LE_ADV_0, HCI_LE_PER_ADV, HCI_LE_SCAN, HCI_SSP_ENABLED, diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index d790b0d4eb9a..1dabf5a7ae18 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1609,6 +1609,8 @@ static u8 hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, void *data, if (adv && !adv->periodic) adv->enabled = true; + else if (!set->handle) + hci_dev_set_flag(hdev, HCI_LE_ADV_0); conn = hci_lookup_le_connect(hdev); if (conn) @@ -1619,6 +1621,8 @@ static u8 hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, void *data, if (cp->num_of_sets) { if (adv) adv->enabled = false; + else if (!set->handle) + hci_dev_clear_flag(hdev, HCI_LE_ADV_0); /* If just one instance was disabled check if there are * any other instance enabled before clearing HCI_LE_ADV diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index d160e5e1fe8a..28ad08cd7d70 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2606,9 +2606,8 @@ static int hci_resume_advertising_sync(struct hci_dev *hdev) /* If current advertising instance is set to instance 0x00 * then we need to re-enable it. */ - if (!hdev->cur_adv_instance) - err = hci_enable_ext_advertising_sync(hdev, - hdev->cur_adv_instance); + if (hci_dev_test_and_clear_flag(hdev, HCI_LE_ADV_0)) + err = hci_enable_ext_advertising_sync(hdev, 0x00); } else { /* Schedule for most recent instance to be restarted and begin * the software rotation loop -- cgit v1.2.3 From e8785404de06a69d89dcdd1e9a0b6ea42dc6d327 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Fri, 3 Oct 2025 22:07:32 +0300 Subject: Bluetooth: MGMT: fix crash in set_mesh_sync and set_mesh_complete There is a BUG: KASAN: stack-out-of-bounds in set_mesh_sync due to memcpy from badly declared on-stack flexible array. Another crash is in set_mesh_complete() due to double list_del via mgmt_pending_valid + mgmt_pending_remove. Use DEFINE_FLEX to declare the flexible array right, and don't memcpy outside bounds. As mgmt_pending_valid removes the cmd from list, use mgmt_pending_free, and also report status on error. Fixes: 302a1f674c00d ("Bluetooth: MGMT: Fix possible UAFs") Signed-off-by: Pauli Virtanen Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/mgmt.h | 2 +- net/bluetooth/mgmt.c | 26 +++++++++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 74edea06985b..bca0333f1e99 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -853,7 +853,7 @@ struct mgmt_cp_set_mesh { __le16 window; __le16 period; __u8 num_ad_types; - __u8 ad_types[]; + __u8 ad_types[] __counted_by(num_ad_types); } __packed; #define MGMT_SET_MESH_RECEIVER_SIZE 6 diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index a3d16eece0d2..24e335e3a727 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2175,19 +2175,24 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err) sk = cmd->sk; if (status) { + mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, + status); mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, true, cmd_status_rsp, &status); - return; + goto done; } - mgmt_pending_remove(cmd); mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, 0, NULL, 0); + +done: + mgmt_pending_free(cmd); } static int set_mesh_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_set_mesh cp; + DEFINE_FLEX(struct mgmt_cp_set_mesh, cp, ad_types, num_ad_types, + sizeof(hdev->mesh_ad_types)); size_t len; mutex_lock(&hdev->mgmt_pending_lock); @@ -2197,27 +2202,26 @@ static int set_mesh_sync(struct hci_dev *hdev, void *data) return -ECANCELED; } - memcpy(&cp, cmd->param, sizeof(cp)); + len = cmd->param_len; + memcpy(cp, cmd->param, min(__struct_size(cp), len)); mutex_unlock(&hdev->mgmt_pending_lock); - len = cmd->param_len; - memset(hdev->mesh_ad_types, 0, sizeof(hdev->mesh_ad_types)); - if (cp.enable) + if (cp->enable) hci_dev_set_flag(hdev, HCI_MESH); else hci_dev_clear_flag(hdev, HCI_MESH); - hdev->le_scan_interval = __le16_to_cpu(cp.period); - hdev->le_scan_window = __le16_to_cpu(cp.window); + hdev->le_scan_interval = __le16_to_cpu(cp->period); + hdev->le_scan_window = __le16_to_cpu(cp->window); - len -= sizeof(cp); + len -= sizeof(struct mgmt_cp_set_mesh); /* If filters don't fit, forward all adv pkts */ if (len <= sizeof(hdev->mesh_ad_types)) - memcpy(hdev->mesh_ad_types, cp.ad_types, len); + memcpy(hdev->mesh_ad_types, cp->ad_types, len); hci_update_passive_scan_sync(hdev); return 0; -- cgit v1.2.3 From 76e20da0bd00c556ed0a1e7250bdb6ac3e808ea8 Mon Sep 17 00:00:00 2001 From: Frédéric Danis Date: Mon, 6 Oct 2025 10:35:44 +0200 Subject: Revert "Bluetooth: L2CAP: convert timeouts to secs_to_jiffies()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit c9d84da18d1e0d28a7e16ca6df8e6d47570501d4. It replaces in L2CAP calls to msecs_to_jiffies() to secs_to_jiffies() and updates the constants accordingly. But the constants are also used in LCAP Configure Request and L2CAP Configure Response which expect values in milliseconds. This may prevent correct usage of L2CAP channel. To fix it, keep those constants in milliseconds and so revert this change. Fixes: c9d84da18d1e ("Bluetooth: L2CAP: convert timeouts to secs_to_jiffies()") Signed-off-by: Frédéric Danis Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/l2cap.h | 4 ++-- net/bluetooth/l2cap_core.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 4bb0eaedda18..00e182a22720 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -38,8 +38,8 @@ #define L2CAP_DEFAULT_TX_WINDOW 63 #define L2CAP_DEFAULT_EXT_WINDOW 0x3FFF #define L2CAP_DEFAULT_MAX_TX 3 -#define L2CAP_DEFAULT_RETRANS_TO 2 /* seconds */ -#define L2CAP_DEFAULT_MONITOR_TO 12 /* seconds */ +#define L2CAP_DEFAULT_RETRANS_TO 2000 /* 2 seconds */ +#define L2CAP_DEFAULT_MONITOR_TO 12000 /* 12 seconds */ #define L2CAP_DEFAULT_MAX_PDU_SIZE 1492 /* Sized for AMP packet */ #define L2CAP_DEFAULT_ACK_TO 200 #define L2CAP_DEFAULT_MAX_SDU_SIZE 0xFFFF diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 805c752ac0a9..d08320380ad6 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -282,7 +282,7 @@ static void __set_retrans_timer(struct l2cap_chan *chan) if (!delayed_work_pending(&chan->monitor_timer) && chan->retrans_timeout) { l2cap_set_timer(chan, &chan->retrans_timer, - secs_to_jiffies(chan->retrans_timeout)); + msecs_to_jiffies(chan->retrans_timeout)); } } @@ -291,7 +291,7 @@ static void __set_monitor_timer(struct l2cap_chan *chan) __clear_retrans_timer(chan); if (chan->monitor_timeout) { l2cap_set_timer(chan, &chan->monitor_timer, - secs_to_jiffies(chan->monitor_timeout)); + msecs_to_jiffies(chan->monitor_timeout)); } } -- cgit v1.2.3 From 751463ceefc3397566d03c8b64ef4a77f5fd88ac Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 22 Oct 2025 16:03:19 -0400 Subject: Bluetooth: hci_core: Fix tracking of periodic advertisement Periodic advertising enabled flag cannot be tracked by the enabled flag since advertising and periodic advertising each can be enabled/disabled separately from one another causing the states to be inconsistent when for example an advertising set is disabled its enabled flag is set to false which is then used for periodic which has not being disabled. Fixes: eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_event.c | 7 +++++-- net/bluetooth/hci_sync.c | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2924c2bf2a98..b8100dbfe5d7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -244,6 +244,7 @@ struct adv_info { bool enabled; bool pending; bool periodic; + bool periodic_enabled; __u8 mesh; __u8 instance; __u8 handle; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 1dabf5a7ae18..d37db364acf7 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1607,7 +1607,7 @@ static u8 hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, void *data, hci_dev_set_flag(hdev, HCI_LE_ADV); - if (adv && !adv->periodic) + if (adv) adv->enabled = true; else if (!set->handle) hci_dev_set_flag(hdev, HCI_LE_ADV_0); @@ -3963,8 +3963,11 @@ static u8 hci_cc_le_set_per_adv_enable(struct hci_dev *hdev, void *data, hci_dev_set_flag(hdev, HCI_LE_PER_ADV); if (adv) - adv->enabled = true; + adv->periodic_enabled = true; } else { + if (adv) + adv->periodic_enabled = false; + /* If just one instance was disabled check if there are * any other instance enabled before clearing HCI_LE_PER_ADV. * The current periodic adv instance will be marked as diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 28ad08cd7d70..73fc41b68b68 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1607,7 +1607,7 @@ int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance) /* If periodic advertising already disabled there is nothing to do. */ adv = hci_find_adv_instance(hdev, instance); - if (!adv || !adv->periodic || !adv->enabled) + if (!adv || !adv->periodic_enabled) return 0; memset(&cp, 0, sizeof(cp)); @@ -1672,7 +1672,7 @@ static int hci_enable_per_advertising_sync(struct hci_dev *hdev, u8 instance) /* If periodic advertising already enabled there is nothing to do. */ adv = hci_find_adv_instance(hdev, instance); - if (adv && adv->periodic && adv->enabled) + if (adv && adv->periodic_enabled) return 0; memset(&cp, 0, sizeof(cp)); -- cgit v1.2.3 From 7ceba45a6658ce637da334cd0ebf27f4ede6c0fe Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 28 Oct 2025 12:58:37 +0200 Subject: wifi: cfg80211: add an hrtimer based delayed work item The normal timer mechanism assume that timeout further in the future need a lower accuracy. As an example, the granularity for a timer scheduled 4096 ms in the future on a 1000 Hz system is already 512 ms. This granularity is perfectly sufficient for e.g. timeouts, but there are other types of events that will happen at a future point in time and require a higher accuracy. Add a new wiphy_hrtimer_work type that uses an hrtimer internally. The API is almost identical to the existing wiphy_delayed_work and it can be used as a drop-in replacement after minor adjustments. The work will be scheduled relative to the current time with a slack of 1 millisecond. CC: stable@vger.kernel.org # 6.4+ Signed-off-by: Benjamin Berg Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20251028125710.7f13a2adc5eb.I01b5af0363869864b0580d9c2a1770bafab69566@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/wireless/core.c | 56 ++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 21 ++++++++++++++ 3 files changed, 155 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 781624f5913a..820e299f06b5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6435,6 +6435,11 @@ static inline void wiphy_delayed_work_init(struct wiphy_delayed_work *dwork, * after wiphy_lock() was called. Therefore, wiphy_cancel_work() can * use just cancel_work() instead of cancel_work_sync(), it requires * being in a section protected by wiphy_lock(). + * + * Note that these are scheduled with a timer where the accuracy + * becomes less the longer in the future the scheduled timer is. Use + * wiphy_hrtimer_work_queue() if the timer must be not be late by more + * than approximately 10 percent. */ void wiphy_delayed_work_queue(struct wiphy *wiphy, struct wiphy_delayed_work *dwork, @@ -6506,6 +6511,79 @@ void wiphy_delayed_work_flush(struct wiphy *wiphy, bool wiphy_delayed_work_pending(struct wiphy *wiphy, struct wiphy_delayed_work *dwork); +struct wiphy_hrtimer_work { + struct wiphy_work work; + struct wiphy *wiphy; + struct hrtimer timer; +}; + +enum hrtimer_restart wiphy_hrtimer_work_timer(struct hrtimer *t); + +static inline void wiphy_hrtimer_work_init(struct wiphy_hrtimer_work *hrwork, + wiphy_work_func_t func) +{ + hrtimer_setup(&hrwork->timer, wiphy_hrtimer_work_timer, + CLOCK_BOOTTIME, HRTIMER_MODE_REL); + wiphy_work_init(&hrwork->work, func); +} + +/** + * wiphy_hrtimer_work_queue - queue hrtimer work for the wiphy + * @wiphy: the wiphy to queue for + * @hrwork: the high resolution timer worker + * @delay: the delay given as a ktime_t + * + * Please refer to wiphy_delayed_work_queue(). The difference is that + * the hrtimer work uses a high resolution timer for scheduling. This + * may be needed if timeouts might be scheduled further in the future + * and the accuracy of the normal timer is not sufficient. + * + * Expect a delay of a few milliseconds as the timer is scheduled + * with some slack and some more time may pass between queueing the + * work and its start. + */ +void wiphy_hrtimer_work_queue(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork, + ktime_t delay); + +/** + * wiphy_hrtimer_work_cancel - cancel previously queued hrtimer work + * @wiphy: the wiphy, for debug purposes + * @hrtimer: the hrtimer work to cancel + * + * Cancel the work *without* waiting for it, this assumes being + * called under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_hrtimer_work_cancel(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrtimer); + +/** + * wiphy_hrtimer_work_flush - flush previously queued hrtimer work + * @wiphy: the wiphy, for debug purposes + * @hrwork: the hrtimer work to flush + * + * Flush the work (i.e. run it if pending). This must be called + * under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_hrtimer_work_flush(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork); + +/** + * wiphy_hrtimer_work_pending - Find out whether a wiphy hrtimer + * work item is currently pending. + * + * @wiphy: the wiphy, for debug purposes + * @hrwork: the hrtimer work in question + * + * Return: true if timer is pending, false otherwise + * + * Please refer to the wiphy_delayed_work_pending() documentation as + * this is the equivalent function for hrtimer based delayed work + * items. + */ +bool wiphy_hrtimer_work_pending(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork); + /** * enum ieee80211_ap_reg_power - regulatory power for an Access Point * diff --git a/net/wireless/core.c b/net/wireless/core.c index 797f9f2004a6..54a34d8d356e 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1787,6 +1787,62 @@ bool wiphy_delayed_work_pending(struct wiphy *wiphy, } EXPORT_SYMBOL_GPL(wiphy_delayed_work_pending); +enum hrtimer_restart wiphy_hrtimer_work_timer(struct hrtimer *t) +{ + struct wiphy_hrtimer_work *hrwork = + container_of(t, struct wiphy_hrtimer_work, timer); + + wiphy_work_queue(hrwork->wiphy, &hrwork->work); + + return HRTIMER_NORESTART; +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_timer); + +void wiphy_hrtimer_work_queue(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork, + ktime_t delay) +{ + trace_wiphy_hrtimer_work_queue(wiphy, &hrwork->work, delay); + + if (!delay) { + hrtimer_cancel(&hrwork->timer); + wiphy_work_queue(wiphy, &hrwork->work); + return; + } + + hrwork->wiphy = wiphy; + hrtimer_start_range_ns(&hrwork->timer, delay, + 1000 * NSEC_PER_USEC, HRTIMER_MODE_REL); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_queue); + +void wiphy_hrtimer_work_cancel(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork) +{ + lockdep_assert_held(&wiphy->mtx); + + hrtimer_cancel(&hrwork->timer); + wiphy_work_cancel(wiphy, &hrwork->work); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_cancel); + +void wiphy_hrtimer_work_flush(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork) +{ + lockdep_assert_held(&wiphy->mtx); + + hrtimer_cancel(&hrwork->timer); + wiphy_work_flush(wiphy, &hrwork->work); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_flush); + +bool wiphy_hrtimer_work_pending(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork) +{ + return hrtimer_is_queued(&hrwork->timer); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_pending); + static int __init cfg80211_init(void) { int err; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 8a4c34112eb5..2b71f1d867a0 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -304,6 +304,27 @@ TRACE_EVENT(wiphy_delayed_work_queue, __entry->delay) ); +TRACE_EVENT(wiphy_hrtimer_work_queue, + TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work, + ktime_t delay), + TP_ARGS(wiphy, work, delay), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(void *, instance) + __field(void *, func) + __field(ktime_t, delay) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->instance = work; + __entry->func = work->func; + __entry->delay = delay; + ), + TP_printk(WIPHY_PR_FMT " instance=%p func=%pS delay=%llu", + WIPHY_PR_ARG, __entry->instance, __entry->func, + __entry->delay) +); + TRACE_EVENT(wiphy_work_worker_start, TP_PROTO(struct wiphy *wiphy), TP_ARGS(wiphy), -- cgit v1.2.3 From b1e014a1f3275a6f3d0f2b30b8117447fc3915f5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Oct 2025 12:58:01 +0100 Subject: tcp: add newval parameter to tcp_rcvbuf_grow() This patch has no functional change, and prepares the following one. tcp_rcvbuf_grow() will need to have access to tp->rcvq_space.space old and new values. Change mptcp_rcvbuf_grow() in a similar way. Signed-off-by: Eric Dumazet [ Moved 'oldval' declaration to the next patch to avoid warnings at build time. ] Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20251028-net-tcp-recv-autotune-v3-3-74b43ba4c84c@kernel.org Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- net/ipv4/tcp_input.c | 14 +++++++------- net/mptcp/protocol.c | 20 ++++++++------------ 3 files changed, 16 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 5ca230ed526a..ab20f549b8f9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -370,7 +370,7 @@ void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, int *karg); enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); -void tcp_rcvbuf_grow(struct sock *sk); +void tcp_rcvbuf_grow(struct sock *sk, u32 newval); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 31ea5af49f2d..cb4e07f84ae2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -891,18 +891,20 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, } } -void tcp_rcvbuf_grow(struct sock *sk) +void tcp_rcvbuf_grow(struct sock *sk, u32 newval) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); - int rcvwin, rcvbuf, cap; + u32 rcvwin, rcvbuf, cap; + + tp->rcvq_space.space = newval; if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) return; /* slow start: allow the sender to double its rate. */ - rcvwin = tp->rcvq_space.space << 1; + rcvwin = newval << 1; if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; @@ -943,9 +945,7 @@ void tcp_rcv_space_adjust(struct sock *sk) trace_tcp_rcvbuf_grow(sk, time); - tp->rcvq_space.space = copied; - - tcp_rcvbuf_grow(sk); + tcp_rcvbuf_grow(sk, copied); new_measure: tp->rcvq_space.seq = tp->copied_seq; @@ -5270,7 +5270,7 @@ end: } /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ if (sk->sk_socket) - tcp_rcvbuf_grow(sk); + tcp_rcvbuf_grow(sk, tp->rcvq_space.space); } static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a8a3bdf95543..052a0c62023f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -194,17 +194,18 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, * - mptcp does not maintain a msk-level window clamp * - returns true when the receive buffer is actually updated */ -static bool mptcp_rcvbuf_grow(struct sock *sk) +static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval) { struct mptcp_sock *msk = mptcp_sk(sk); const struct net *net = sock_net(sk); - int rcvwin, rcvbuf, cap; + u32 rcvwin, rcvbuf, cap; + msk->rcvq_space.space = newval; if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) return false; - rcvwin = msk->rcvq_space.space << 1; + rcvwin = newval << 1; if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq; @@ -334,7 +335,7 @@ end: skb_set_owner_r(skb, sk); /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ if (sk->sk_socket) - mptcp_rcvbuf_grow(sk); + mptcp_rcvbuf_grow(sk, msk->rcvq_space.space); } static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset, @@ -2049,10 +2050,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; - msk->rcvq_space.space = msk->rcvq_space.copied; - if (mptcp_rcvbuf_grow(sk)) { - int copied = msk->rcvq_space.copied; - + if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) { /* Make subflows follow along. If we do not do this, we * get drops at subflow level if skbs can't be moved to * the mptcp rx queue fast enough (announced rcv_win can @@ -2065,10 +2063,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) ssk = mptcp_subflow_tcp_sock(subflow); slow = lock_sock_fast(ssk); /* subflows can be added before tcp_init_transfer() */ - if (tcp_sk(ssk)->rcvq_space.space) { - tcp_sk(ssk)->rcvq_space.space = copied; - tcp_rcvbuf_grow(ssk); - } + if (tcp_sk(ssk)->rcvq_space.space) + tcp_rcvbuf_grow(ssk, msk->rcvq_space.copied); unlock_sock_fast(ssk, slow); } } -- cgit v1.2.3 From 34892cfec0c2d96787c4be7bda0d5f18d7dacf85 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 26 Oct 2025 22:03:01 +0200 Subject: net: tls: Change async resync helpers argument Update tls_offload_rx_resync_async_request_start() and tls_offload_rx_resync_async_request_end() to get a struct tls_offload_resync_async parameter directly, rather than extracting it from struct sock. This change aligns the function signatures with the upcoming tls_offload_rx_resync_async_request_cancel() helper, which will be introduced in a subsequent patch. Signed-off-by: Shahar Shitrit Reviewed-by: Sabrina Dubroca Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1761508983-937977-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c | 9 +++++++-- include/net/tls.h | 21 ++++++++------------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index d7a11ff9bbdb..5fbc92269585 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -425,12 +425,14 @@ void mlx5e_ktls_handle_get_psv_completion(struct mlx5e_icosq_wqe_info *wi, { struct mlx5e_ktls_rx_resync_buf *buf = wi->tls_get_params.buf; struct mlx5e_ktls_offload_context_rx *priv_rx; + struct tls_offload_context_rx *rx_ctx; u8 tracker_state, auth_state, *ctx; struct device *dev; u32 hw_seq; priv_rx = buf->priv_rx; dev = mlx5_core_dma_dev(sq->channel->mdev); + rx_ctx = tls_offload_ctx_rx(tls_get_ctx(priv_rx->sk)); if (unlikely(test_bit(MLX5E_PRIV_RX_FLAG_DELETING, priv_rx->flags))) goto out; @@ -447,7 +449,8 @@ void mlx5e_ktls_handle_get_psv_completion(struct mlx5e_icosq_wqe_info *wi, } hw_seq = MLX5_GET(tls_progress_params, ctx, hw_resync_tcp_sn); - tls_offload_rx_resync_async_request_end(priv_rx->sk, cpu_to_be32(hw_seq)); + tls_offload_rx_resync_async_request_end(rx_ctx->resync_async, + cpu_to_be32(hw_seq)); priv_rx->rq_stats->tls_resync_req_end++; out: mlx5e_ktls_priv_rx_put(priv_rx); @@ -482,6 +485,7 @@ static bool resync_queue_get_psv(struct sock *sk) static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb) { struct ethhdr *eth = (struct ethhdr *)(skb->data); + struct tls_offload_resync_async *resync_async; struct net_device *netdev = rq->netdev; struct net *net = dev_net(netdev); struct sock *sk = NULL; @@ -527,7 +531,8 @@ static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb) seq = th->seq; datalen = skb->len - depth; - tls_offload_rx_resync_async_request_start(sk, seq, datalen); + resync_async = tls_offload_ctx_rx(tls_get_ctx(sk))->resync_async; + tls_offload_rx_resync_async_request_start(resync_async, seq, datalen); rq->stats->tls_resync_req_start++; unref: diff --git a/include/net/tls.h b/include/net/tls.h index 857340338b69..b90f3b675c3c 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -451,25 +451,20 @@ static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) /* Log all TLS record header TCP sequences in [seq, seq+len] */ static inline void -tls_offload_rx_resync_async_request_start(struct sock *sk, __be32 seq, u16 len) +tls_offload_rx_resync_async_request_start(struct tls_offload_resync_async *resync_async, + __be32 seq, u16 len) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); - - atomic64_set(&rx_ctx->resync_async->req, ((u64)ntohl(seq) << 32) | + atomic64_set(&resync_async->req, ((u64)ntohl(seq) << 32) | ((u64)len << 16) | RESYNC_REQ | RESYNC_REQ_ASYNC); - rx_ctx->resync_async->loglen = 0; - rx_ctx->resync_async->rcd_delta = 0; + resync_async->loglen = 0; + resync_async->rcd_delta = 0; } static inline void -tls_offload_rx_resync_async_request_end(struct sock *sk, __be32 seq) +tls_offload_rx_resync_async_request_end(struct tls_offload_resync_async *resync_async, + __be32 seq) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); - - atomic64_set(&rx_ctx->resync_async->req, - ((u64)ntohl(seq) << 32) | RESYNC_REQ); + atomic64_set(&resync_async->req, ((u64)ntohl(seq) << 32) | RESYNC_REQ); } static inline void -- cgit v1.2.3 From c15d5c62ab313c19121f10e25d4fec852bd1c40c Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 26 Oct 2025 22:03:02 +0200 Subject: net: tls: Cancel RX async resync request on rcd_delta overflow When a netdev issues a RX async resync request for a TLS connection, the TLS module handles it by logging record headers and attempting to match them to the tcp_sn provided by the device. If a match is found, the TLS module approves the tcp_sn for resynchronization. While waiting for a device response, the TLS module also increments rcd_delta each time a new TLS record is received, tracking the distance from the original resync request. However, if the device response is delayed or fails (e.g due to unstable connection and device getting out of tracking, hardware errors, resource exhaustion etc.), the TLS module keeps logging and incrementing, which can lead to a WARN() when rcd_delta exceeds the threshold. To address this, introduce tls_offload_rx_resync_async_request_cancel() to explicitly cancel resync requests when a device response failure is detected. Call this helper also as a final safeguard when rcd_delta crosses its threshold, as reaching this point implies that earlier cancellation did not occur. Signed-off-by: Shahar Shitrit Reviewed-by: Sabrina Dubroca Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1761508983-937977-3-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/tls.h | 6 ++++++ net/tls/tls_device.c | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index b90f3b675c3c..c7bcdb3afad7 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -467,6 +467,12 @@ tls_offload_rx_resync_async_request_end(struct tls_offload_resync_async *resync_ atomic64_set(&resync_async->req, ((u64)ntohl(seq) << 32) | RESYNC_REQ); } +static inline void +tls_offload_rx_resync_async_request_cancel(struct tls_offload_resync_async *resync_async) +{ + atomic64_set(&resync_async->req, 0); +} + static inline void tls_offload_rx_resync_set_type(struct sock *sk, enum tls_offload_sync_type type) { diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a64ae15b1a60..71734411ff4c 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -723,8 +723,10 @@ tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async, /* shouldn't get to wraparound: * too long in async stage, something bad happened */ - if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) + if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) { + tls_offload_rx_resync_async_request_cancel(resync_async); return false; + } /* asynchronous stage: log all headers seq such that * req_seq <= seq <= end_seq, and wait for real resync request -- cgit v1.2.3 From c57f5fee54dfc83ee1d7f70f7beb9410b8466e9e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 25 Oct 2025 21:53:20 +0100 Subject: libeth: xdp: Disable generic kCFI pass for libeth_xdp_tx_xmit_bulk() When building drivers/net/ethernet/intel/idpf/xsk.c for ARCH=arm with CONFIG_CFI=y using a version of LLVM prior to 22.0.0, there is a BUILD_BUG_ON failure: $ cat arch/arm/configs/repro.config CONFIG_BPF_SYSCALL=y CONFIG_CFI=y CONFIG_IDPF=y CONFIG_XDP_SOCKETS=y $ make -skj"$(nproc)" ARCH=arm LLVM=1 clean defconfig repro.config drivers/net/ethernet/intel/idpf/xsk.o In file included from drivers/net/ethernet/intel/idpf/xsk.c:4: include/net/libeth/xsk.h:205:2: error: call to '__compiletime_assert_728' declared with 'error' attribute: BUILD_BUG_ON failed: !__builtin_constant_p(tmo == libeth_xsktmo) 205 | BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo)); | ^ ... libeth_xdp_tx_xmit_bulk() indirectly calls libeth_xsk_xmit_fill_buf() but these functions are marked as __always_inline so that the compiler can turn these indirect calls into direct ones and see that the tmo parameter to __libeth_xsk_xmit_fill_buf_md() is ultimately libeth_xsktmo from idpf_xsk_xmit(). Unfortunately, the generic kCFI pass in LLVM expands the kCFI bundles from the indirect calls in libeth_xdp_tx_xmit_bulk() in such a way that later optimizations cannot turn these calls into direct ones, making the BUILD_BUG_ON fail because it cannot be proved at compile time that tmo is libeth_xsktmo. Disable the generic kCFI pass for libeth_xdp_tx_xmit_bulk() to ensure these indirect calls can always be turned into direct calls to avoid this error. Closes: https://github.com/ClangBuiltLinux/linux/issues/2124 Fixes: 9705d6552f58 ("idpf: implement Rx path for AF_XDP") Signed-off-by: Nathan Chancellor Reviewed-by: Aleksandr Loktionov Acked-by: Alexander Lobakin Link: https://patch.msgid.link/20251025-idpf-fix-arm-kcfi-build-error-v1-3-ec57221153ae@kernel.org Signed-off-by: Kees Cook --- include/net/libeth/xdp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index bc3507edd589..898723ab62e8 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -513,7 +513,7 @@ struct libeth_xdp_tx_desc { * can't fail, but can send less frames if there's no enough free descriptors * available. The actual free space is returned by @prep from the driver. */ -static __always_inline u32 +static __always_inline __nocfi_generic u32 libeth_xdp_tx_xmit_bulk(const struct libeth_xdp_tx_frame *bulk, void *xdpsq, u32 n, bool unroll, u64 priv, u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), -- cgit v1.2.3 From 61fafbee6cfed283c02a320896089f658fa67e56 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Tue, 28 Oct 2025 04:22:48 +0200 Subject: xfrm: Determine inner GSO type from packet inner protocol The GSO segmentation functions for ESP tunnel mode (xfrm4_tunnel_gso_segment and xfrm6_tunnel_gso_segment) were determining the inner packet's L2 protocol type by checking the static x->inner_mode.family field from the xfrm state. This is unreliable. In tunnel mode, the state's actual inner family could be defined by x->inner_mode.family or by x->inner_mode_iaf.family. Checking only the former can lead to a mismatch with the actual packet being processed, causing GSO to create segments with the wrong L2 header type. This patch fixes the bug by deriving the inner mode directly from the packet's inner protocol stored in XFRM_MODE_SKB_CB(skb)->protocol. Instead of replicating the code, this patch modifies the xfrm_ip2inner_mode helper function. It now correctly returns &x->inner_mode if the selector family (x->sel.family) is already specified, thereby handling both specific and AF_UNSPEC cases appropriately. With this change, ESP GSO can use xfrm_ip2inner_mode to get the correct inner mode. It doesn't affect existing callers, as the updated logic now mirrors the checks they were already performing externally. Fixes: 26dbd66eab80 ("esp: choose the correct inner protocol for GSO on inter address family tunnels") Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- net/ipv4/esp4_offload.c | 6 ++++-- net/ipv6/esp6_offload.c | 6 ++++-- 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f3014e4f54fc..0a14daaa5dd4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -536,7 +536,8 @@ static inline int xfrm_af2proto(unsigned int family) static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto) { - if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || + if ((x->sel.family != AF_UNSPEC) || + (ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6)) return &x->inner_mode; else diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index e0d94270da28..05828d4cb6cd 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -122,8 +122,10 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __be16 type = x->inner_mode.family == AF_INET6 ? htons(ETH_P_IPV6) - : htons(ETH_P_IP); + const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, + XFRM_MODE_SKB_CB(skb)->protocol); + __be16 type = inner_mode->family == AF_INET6 ? htons(ETH_P_IPV6) + : htons(ETH_P_IP); return skb_eth_gso_segment(skb, features, type); } diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 7b41fb4f00b5..22410243ebe8 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -158,8 +158,10 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __be16 type = x->inner_mode.family == AF_INET ? htons(ETH_P_IP) - : htons(ETH_P_IPV6); + const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, + XFRM_MODE_SKB_CB(skb)->protocol); + __be16 type = inner_mode->family == AF_INET ? htons(ETH_P_IP) + : htons(ETH_P_IPV6); return skb_eth_gso_segment(skb, features, type); } -- cgit v1.2.3 From 8d59fba49362c65332395789fd82771f1028d87e Mon Sep 17 00:00:00 2001 From: Ilia Gavrilov Date: Mon, 20 Oct 2025 15:12:55 +0000 Subject: Bluetooth: MGMT: Fix OOB access in parse_adv_monitor_pattern() In the parse_adv_monitor_pattern() function, the value of the 'length' variable is currently limited to HCI_MAX_EXT_AD_LENGTH(251). The size of the 'value' array in the mgmt_adv_pattern structure is 31. If the value of 'pattern[i].length' is set in the user space and exceeds 31, the 'patterns[i].value' array can be accessed out of bound when copied. Increasing the size of the 'value' array in the 'mgmt_adv_pattern' structure will break the userspace. Considering this, and to avoid OOB access revert the limits for 'offset' and 'length' back to the value of HCI_MAX_AD_LENGTH. Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with SVACE. Fixes: db08722fc7d4 ("Bluetooth: hci_core: Fix missing instances using HCI_MAX_AD_LENGTH") Cc: stable@vger.kernel.org Signed-off-by: Ilia Gavrilov Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/mgmt.h | 2 +- net/bluetooth/mgmt.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index bca0333f1e99..f5be96f08b9d 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -780,7 +780,7 @@ struct mgmt_adv_pattern { __u8 ad_type; __u8 offset; __u8 length; - __u8 value[31]; + __u8 value[HCI_MAX_AD_LENGTH]; } __packed; #define MGMT_OP_ADD_ADV_PATTERNS_MONITOR 0x0052 diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 24e335e3a727..79762bfaea5f 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -5395,9 +5395,9 @@ static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count, for (i = 0; i < pattern_count; i++) { offset = patterns[i].offset; length = patterns[i].length; - if (offset >= HCI_MAX_EXT_AD_LENGTH || - length > HCI_MAX_EXT_AD_LENGTH || - (offset + length) > HCI_MAX_EXT_AD_LENGTH) + if (offset >= HCI_MAX_AD_LENGTH || + length > HCI_MAX_AD_LENGTH || + (offset + length) > HCI_MAX_AD_LENGTH) return MGMT_STATUS_INVALID_PARAMS; p = kmalloc(sizeof(*p), GFP_KERNEL); -- cgit v1.2.3 From 485e0626e58768f3c53ba61ab9e09d6b60a455f4 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 6 Nov 2025 13:05:35 -0500 Subject: Bluetooth: hci_event: Fix not handling PA Sync Lost event This handles PA Sync Lost event which previously was assumed to be handled with BIG Sync Lost but their lifetime are not the same thus why there are 2 different events to inform when each sync is lost. Fixes: b2a5f2e1c127 ("Bluetooth: hci_event: Add support for handling LE BIG Sync Lost event") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 5 +++++ net/bluetooth/hci_event.c | 49 ++++++++++++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8d0e703bc929..cb4c02d00759 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2783,6 +2783,11 @@ struct hci_ev_le_per_adv_report { __u8 data[]; } __packed; +#define HCI_EV_LE_PA_SYNC_LOST 0x10 +struct hci_ev_le_pa_sync_lost { + __le16 handle; +} __packed; + #define LE_PA_DATA_COMPLETE 0x00 #define LE_PA_DATA_MORE_TO_COME 0x01 #define LE_PA_DATA_TRUNCATED 0x02 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 03328c1dd090..3838b90343d9 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5843,6 +5843,29 @@ static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, void *data, le16_to_cpu(ev->supervision_timeout)); } +static void hci_le_pa_sync_lost_evt(struct hci_dev *hdev, void *data, + struct sk_buff *skb) +{ + struct hci_ev_le_pa_sync_lost *ev = data; + u16 handle = le16_to_cpu(ev->handle); + struct hci_conn *conn; + + bt_dev_dbg(hdev, "sync handle 0x%4.4x", handle); + + hci_dev_lock(hdev); + + /* Delete the pa sync connection */ + conn = hci_conn_hash_lookup_pa_sync_handle(hdev, handle); + if (conn) { + clear_bit(HCI_CONN_BIG_SYNC, &conn->flags); + clear_bit(HCI_CONN_PA_SYNC, &conn->flags); + hci_disconn_cfm(conn, HCI_ERROR_REMOTE_USER_TERM); + hci_conn_del(conn); + } + + hci_dev_unlock(hdev); +} + static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -7046,29 +7069,24 @@ static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct hci_evt_le_big_sync_lost *ev = data; - struct hci_conn *bis, *conn; - bool mgmt_conn; + struct hci_conn *bis; + bool mgmt_conn = false; bt_dev_dbg(hdev, "big handle 0x%2.2x", ev->handle); hci_dev_lock(hdev); - /* Delete the pa sync connection */ - bis = hci_conn_hash_lookup_pa_sync_big_handle(hdev, ev->handle); - if (bis) { - conn = hci_conn_hash_lookup_pa_sync_handle(hdev, - bis->sync_handle); - if (conn) - hci_conn_del(conn); - } - /* Delete each bis connection */ while ((bis = hci_conn_hash_lookup_big_state(hdev, ev->handle, BT_CONNECTED, HCI_ROLE_SLAVE))) { - mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &bis->flags); - mgmt_device_disconnected(hdev, &bis->dst, bis->type, bis->dst_type, - ev->reason, mgmt_conn); + if (!mgmt_conn) { + mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, + &bis->flags); + mgmt_device_disconnected(hdev, &bis->dst, bis->type, + bis->dst_type, ev->reason, + mgmt_conn); + } clear_bit(HCI_CONN_BIG_SYNC, &bis->flags); hci_disconn_cfm(bis, ev->reason); @@ -7182,6 +7200,9 @@ static const struct hci_le_ev { hci_le_per_adv_report_evt, sizeof(struct hci_ev_le_per_adv_report), HCI_MAX_EVENT_SIZE), + /* [0x10 = HCI_EV_LE_PA_SYNC_LOST] */ + HCI_LE_EV(HCI_EV_LE_PA_SYNC_LOST, hci_le_pa_sync_lost_evt, + sizeof(struct hci_ev_le_pa_sync_lost)), /* [0x12 = HCI_EV_LE_EXT_ADV_SET_TERM] */ HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt, sizeof(struct hci_evt_le_ext_adv_set_term)), -- cgit v1.2.3 From 4015b979767125cf8a2233a145a3b3af78bfd8fb Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Wed, 12 Nov 2025 15:53:34 +0800 Subject: Bluetooth: btusb: mediatek: Fix kernel crash when releasing mtk iso interface When performing reset tests and encountering abnormal card drop issues that lead to a kernel crash, it is necessary to perform a null check before releasing resources to avoid attempting to release a null pointer. <4>[ 29.158070] Hardware name: Google Quigon sku196612/196613 board (DT) <4>[ 29.158076] Workqueue: hci0 hci_cmd_sync_work [bluetooth] <4>[ 29.158154] pstate: 20400009 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) <4>[ 29.158162] pc : klist_remove+0x90/0x158 <4>[ 29.158174] lr : klist_remove+0x88/0x158 <4>[ 29.158180] sp : ffffffc0846b3c00 <4>[ 29.158185] pmr_save: 000000e0 <4>[ 29.158188] x29: ffffffc0846b3c30 x28: ffffff80cd31f880 x27: ffffff80c1bdc058 <4>[ 29.158199] x26: dead000000000100 x25: ffffffdbdc624ea3 x24: ffffff80c1bdc4c0 <4>[ 29.158209] x23: ffffffdbdc62a3e6 x22: ffffff80c6c07000 x21: ffffffdbdc829290 <4>[ 29.158219] x20: 0000000000000000 x19: ffffff80cd3e0648 x18: 000000031ec97781 <4>[ 29.158229] x17: ffffff80c1bdc4a8 x16: ffffffdc10576548 x15: ffffff80c1180428 <4>[ 29.158238] x14: 0000000000000000 x13: 000000000000e380 x12: 0000000000000018 <4>[ 29.158248] x11: ffffff80c2a7fd10 x10: 0000000000000000 x9 : 0000000100000000 <4>[ 29.158257] x8 : 0000000000000000 x7 : 7f7f7f7f7f7f7f7f x6 : 2d7223ff6364626d <4>[ 29.158266] x5 : 0000008000000000 x4 : 0000000000000020 x3 : 2e7325006465636e <4>[ 29.158275] x2 : ffffffdc11afeff8 x1 : 0000000000000000 x0 : ffffffdc11be4d0c <4>[ 29.158285] Call trace: <4>[ 29.158290] klist_remove+0x90/0x158 <4>[ 29.158298] device_release_driver_internal+0x20c/0x268 <4>[ 29.158308] device_release_driver+0x1c/0x30 <4>[ 29.158316] usb_driver_release_interface+0x70/0x88 <4>[ 29.158325] btusb_mtk_release_iso_intf+0x68/0xd8 [btusb (HASH:e8b6 5)] <4>[ 29.158347] btusb_mtk_reset+0x5c/0x480 [btusb (HASH:e8b6 5)] <4>[ 29.158361] hci_cmd_sync_work+0x10c/0x188 [bluetooth (HASH:a4fa 6)] <4>[ 29.158430] process_scheduled_works+0x258/0x4e8 <4>[ 29.158441] worker_thread+0x300/0x428 <4>[ 29.158448] kthread+0x108/0x1d0 <4>[ 29.158455] ret_from_fork+0x10/0x20 <0>[ 29.158467] Code: 91343000 940139d1 f9400268 927ff914 (f9401297) <4>[ 29.158474] ---[ end trace 0000000000000000 ]--- <0>[ 29.167129] Kernel panic - not syncing: Oops: Fatal exception <2>[ 29.167144] SMP: stopping secondary CPUs <4>[ 29.167158] ------------[ cut here ]------------ Fixes: ceac1cb0259d ("Bluetooth: btusb: mediatek: add ISO data transmission functions") Signed-off-by: Chris Lu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 34 +++++++++++++++++++++++++++------- include/net/bluetooth/hci_core.h | 1 - 2 files changed, 27 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index a722446ec73d..202a845e0236 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -2711,9 +2711,16 @@ static int btusb_recv_event_realtek(struct hci_dev *hdev, struct sk_buff *skb) static void btusb_mtk_claim_iso_intf(struct btusb_data *data) { - struct btmtk_data *btmtk_data = hci_get_priv(data->hdev); + struct btmtk_data *btmtk_data; int err; + if (!data->hdev) + return; + + btmtk_data = hci_get_priv(data->hdev); + if (!btmtk_data) + return; + /* * The function usb_driver_claim_interface() is documented to need * locks held if it's not called from a probe routine. The code here @@ -2735,17 +2742,30 @@ static void btusb_mtk_claim_iso_intf(struct btusb_data *data) static void btusb_mtk_release_iso_intf(struct hci_dev *hdev) { - struct btmtk_data *btmtk_data = hci_get_priv(hdev); + struct btmtk_data *btmtk_data; + + if (!hdev) + return; + + btmtk_data = hci_get_priv(hdev); + if (!btmtk_data) + return; if (test_bit(BTMTK_ISOPKT_OVER_INTR, &btmtk_data->flags)) { usb_kill_anchored_urbs(&btmtk_data->isopkt_anchor); clear_bit(BTMTK_ISOPKT_RUNNING, &btmtk_data->flags); - dev_kfree_skb_irq(btmtk_data->isopkt_skb); - btmtk_data->isopkt_skb = NULL; - usb_set_intfdata(btmtk_data->isopkt_intf, NULL); - usb_driver_release_interface(&btusb_driver, - btmtk_data->isopkt_intf); + if (btmtk_data->isopkt_skb) { + dev_kfree_skb_irq(btmtk_data->isopkt_skb); + btmtk_data->isopkt_skb = NULL; + } + + if (btmtk_data->isopkt_intf) { + usb_set_intfdata(btmtk_data->isopkt_intf, NULL); + usb_driver_release_interface(&btusb_driver, + btmtk_data->isopkt_intf); + btmtk_data->isopkt_intf = NULL; + } } clear_bit(BTMTK_ISOPKT_OVER_INTR, &btmtk_data->flags); diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b8100dbfe5d7..32b1c08c8bba 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -749,7 +749,6 @@ struct hci_conn { __u8 remote_cap; __u8 remote_auth; - __u8 remote_id; unsigned int sent; -- cgit v1.2.3 From 79a2d4678ba90bdba577dc3af88cc900d6dcd5ee Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 15 Nov 2025 18:43:55 +0200 Subject: Bluetooth: hci_core: lookup hci_conn on RX path on protocol side The hdev lock/lookup/unlock/use pattern in the packet RX path doesn't ensure hci_conn* is not concurrently modified/deleted. This locking appears to be leftover from before conn_hash started using RCU commit bf4c63252490b ("Bluetooth: convert conn hash to RCU") and not clear if it had purpose since then. Currently, there are code paths that delete hci_conn* from elsewhere than the ordered hdev->workqueue where the RX work runs in. E.g. commit 5af1f84ed13a ("Bluetooth: hci_sync: Fix UAF on hci_abort_conn_sync") introduced some of these, and there probably were a few others before it. It's better to do the locking so that even if these run concurrently no UAF is possible. Move the lookup of hci_conn and associated socket-specific conn to protocol recv handlers, and do them within a single critical section to cover hci_conn* usage and lookup. syzkaller has reported a crash that appears to be this issue: [Task hdev->workqueue] [Task 2] hci_disconnect_all_sync l2cap_recv_acldata(hcon) hci_conn_get(hcon) hci_abort_conn_sync(hcon) hci_dev_lock hci_dev_lock hci_conn_del(hcon) v-------------------------------- hci_dev_unlock hci_conn_put(hcon) conn = hcon->l2cap_data (UAF) Fixes: 5af1f84ed13a ("Bluetooth: hci_sync: Fix UAF on hci_abort_conn_sync") Reported-by: syzbot+d32d77220b92eddd89ad@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d32d77220b92eddd89ad Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 20 +++++++---- net/bluetooth/hci_core.c | 73 ++++++++++++++-------------------------- net/bluetooth/iso.c | 30 ++++++++++++++--- net/bluetooth/l2cap_core.c | 23 ++++++++++--- net/bluetooth/sco.c | 35 ++++++++++++++----- 5 files changed, 108 insertions(+), 73 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 32b1c08c8bba..0cb87687837f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -856,11 +856,12 @@ extern struct mutex hci_cb_list_lock; /* ----- HCI interface to upper protocols ----- */ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr); int l2cap_disconn_ind(struct hci_conn *hcon); -void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags); +int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, + u16 flags); #if IS_ENABLED(CONFIG_BT_BREDR) int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags); -void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb); +int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb); #else static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) @@ -868,23 +869,30 @@ static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, return 0; } -static inline void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) +static inline int sco_recv_scodata(struct hci_dev *hdev, u16 handle, + struct sk_buff *skb) { + kfree_skb(skb); + return -ENOENT; } #endif #if IS_ENABLED(CONFIG_BT_LE) int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags); -void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags); +int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, + u16 flags); #else static inline int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { return 0; } -static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, - u16 flags) + +static inline int iso_recv(struct hci_dev *hdev, u16 handle, + struct sk_buff *skb, u16 flags) { + kfree_skb(skb); + return -ENOENT; } #endif diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 1920e3d62bda..8ccec73dce45 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3832,13 +3832,14 @@ static void hci_tx_work(struct work_struct *work) static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_acl_hdr *hdr; - struct hci_conn *conn; __u16 handle, flags; + int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ACL packet too small"); - goto drop; + kfree_skb(skb); + return; } handle = __le16_to_cpu(hdr->handle); @@ -3850,36 +3851,27 @@ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) hdev->stat.acl_rx++; - hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_handle(hdev, handle); - hci_dev_unlock(hdev); - - if (conn) { - hci_conn_enter_active_mode(conn, BT_POWER_FORCE_ACTIVE_OFF); - - /* Send to upper protocol */ - l2cap_recv_acldata(conn, skb, flags); - return; - } else { + err = l2cap_recv_acldata(hdev, handle, skb, flags); + if (err == -ENOENT) bt_dev_err(hdev, "ACL packet for unknown connection handle %d", handle); - } - -drop: - kfree_skb(skb); + else if (err) + bt_dev_dbg(hdev, "ACL packet recv for handle %d failed: %d", + handle, err); } /* SCO data packet */ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_sco_hdr *hdr; - struct hci_conn *conn; __u16 handle, flags; + int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "SCO packet too small"); - goto drop; + kfree_skb(skb); + return; } handle = __le16_to_cpu(hdr->handle); @@ -3891,34 +3883,28 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) hdev->stat.sco_rx++; - hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_handle(hdev, handle); - hci_dev_unlock(hdev); + hci_skb_pkt_status(skb) = flags & 0x03; - if (conn) { - /* Send to upper protocol */ - hci_skb_pkt_status(skb) = flags & 0x03; - sco_recv_scodata(conn, skb); - return; - } else { + err = sco_recv_scodata(hdev, handle, skb); + if (err == -ENOENT) bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d", handle); - } - -drop: - kfree_skb(skb); + else if (err) + bt_dev_dbg(hdev, "SCO packet recv for handle %d failed: %d", + handle, err); } static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_iso_hdr *hdr; - struct hci_conn *conn; __u16 handle, flags; + int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ISO packet too small"); - goto drop; + kfree_skb(skb); + return; } handle = __le16_to_cpu(hdr->handle); @@ -3928,22 +3914,13 @@ static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb) bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); - hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_handle(hdev, handle); - hci_dev_unlock(hdev); - - if (!conn) { + err = iso_recv(hdev, handle, skb, flags); + if (err == -ENOENT) bt_dev_err(hdev, "ISO packet for unknown connection handle %d", handle); - goto drop; - } - - /* Send to upper protocol */ - iso_recv(conn, skb, flags); - return; - -drop: - kfree_skb(skb); + else if (err) + bt_dev_dbg(hdev, "ISO packet recv for handle %d failed: %d", + handle, err); } static bool hci_req_is_complete(struct hci_dev *hdev) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 3d98cb6291da..616c2fef91d2 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -2314,14 +2314,31 @@ static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason) iso_conn_del(hcon, bt_to_errno(reason)); } -void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) +int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, u16 flags) { - struct iso_conn *conn = hcon->iso_data; + struct hci_conn *hcon; + struct iso_conn *conn; struct skb_shared_hwtstamps *hwts; __u16 pb, ts, len, sn; - if (!conn) - goto drop; + hci_dev_lock(hdev); + + hcon = hci_conn_hash_lookup_handle(hdev, handle); + if (!hcon) { + hci_dev_unlock(hdev); + kfree_skb(skb); + return -ENOENT; + } + + conn = iso_conn_hold_unless_zero(hcon->iso_data); + hcon = NULL; + + hci_dev_unlock(hdev); + + if (!conn) { + kfree_skb(skb); + return -EINVAL; + } pb = hci_iso_flags_pb(flags); ts = hci_iso_flags_ts(flags); @@ -2377,7 +2394,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) hci_skb_pkt_status(skb) = flags & 0x03; hci_skb_pkt_seqnum(skb) = sn; iso_recv_frame(conn, skb); - return; + goto done; } if (pb == ISO_SINGLE) { @@ -2455,6 +2472,9 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) drop: kfree_skb(skb); +done: + iso_conn_put(conn); + return 0; } static struct hci_cb iso_cb = { diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 35c57657bcf4..07b493331fd7 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7510,13 +7510,24 @@ struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c) return c; } -void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) +int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle, + struct sk_buff *skb, u16 flags) { + struct hci_conn *hcon; struct l2cap_conn *conn; int len; - /* Lock hdev to access l2cap_data to avoid race with l2cap_conn_del */ - hci_dev_lock(hcon->hdev); + /* Lock hdev for hci_conn, and race on l2cap_data vs. l2cap_conn_del */ + hci_dev_lock(hdev); + + hcon = hci_conn_hash_lookup_handle(hdev, handle); + if (!hcon) { + hci_dev_unlock(hdev); + kfree_skb(skb); + return -ENOENT; + } + + hci_conn_enter_active_mode(hcon, BT_POWER_FORCE_ACTIVE_OFF); conn = hcon->l2cap_data; @@ -7524,12 +7535,13 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) conn = l2cap_conn_add(hcon); conn = l2cap_conn_hold_unless_zero(conn); + hcon = NULL; - hci_dev_unlock(hcon->hdev); + hci_dev_unlock(hdev); if (!conn) { kfree_skb(skb); - return; + return -EINVAL; } BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags); @@ -7643,6 +7655,7 @@ drop: unlock: mutex_unlock(&conn->lock); l2cap_conn_put(conn); + return 0; } static struct hci_cb l2cap_cb = { diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index ab0cf442d57b..298c2a9ab4df 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1458,22 +1458,39 @@ static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) sco_conn_del(hcon, bt_to_errno(reason)); } -void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) +int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb) { - struct sco_conn *conn = hcon->sco_data; + struct hci_conn *hcon; + struct sco_conn *conn; - if (!conn) - goto drop; + hci_dev_lock(hdev); + + hcon = hci_conn_hash_lookup_handle(hdev, handle); + if (!hcon) { + hci_dev_unlock(hdev); + kfree_skb(skb); + return -ENOENT; + } + + conn = sco_conn_hold_unless_zero(hcon->sco_data); + hcon = NULL; + + hci_dev_unlock(hdev); + + if (!conn) { + kfree_skb(skb); + return -EINVAL; + } BT_DBG("conn %p len %u", conn, skb->len); - if (skb->len) { + if (skb->len) sco_recv_frame(conn, skb); - return; - } + else + kfree_skb(skb); -drop: - kfree_skb(skb); + sco_conn_put(conn); + return 0; } static struct hci_cb sco_cb = { -- cgit v1.2.3 From 4fe5a00ec70717a7f1002d8913ec6143582b3c8e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 15:41:00 +0000 Subject: net: sched: fix TCF_LAYER_TRANSPORT handling in tcf_get_base_ptr() syzbot reported that tcf_get_base_ptr() can be called while transport header is not set [1]. Instead of returning a dangling pointer, return NULL. Fix tcf_get_base_ptr() callers to handle this NULL value. [1] WARNING: CPU: 1 PID: 6019 at ./include/linux/skbuff.h:3071 skb_transport_header include/linux/skbuff.h:3071 [inline] WARNING: CPU: 1 PID: 6019 at ./include/linux/skbuff.h:3071 tcf_get_base_ptr include/net/pkt_cls.h:539 [inline] WARNING: CPU: 1 PID: 6019 at ./include/linux/skbuff.h:3071 em_nbyte_match+0x2d8/0x3f0 net/sched/em_nbyte.c:43 Modules linked in: CPU: 1 UID: 0 PID: 6019 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Call Trace: tcf_em_match net/sched/ematch.c:494 [inline] __tcf_em_tree_match+0x1ac/0x770 net/sched/ematch.c:520 tcf_em_tree_match include/net/pkt_cls.h:512 [inline] basic_classify+0x115/0x2d0 net/sched/cls_basic.c:50 tc_classify include/net/tc_wrapper.h:197 [inline] __tcf_classify net/sched/cls_api.c:1764 [inline] tcf_classify+0x4cf/0x1140 net/sched/cls_api.c:1860 multiq_classify net/sched/sch_multiq.c:39 [inline] multiq_enqueue+0xfd/0x4c0 net/sched/sch_multiq.c:66 dev_qdisc_enqueue+0x4e/0x260 net/core/dev.c:4118 __dev_xmit_skb net/core/dev.c:4214 [inline] __dev_queue_xmit+0xe83/0x3b50 net/core/dev.c:4729 packet_snd net/packet/af_packet.c:3076 [inline] packet_sendmsg+0x3e33/0x5080 net/packet/af_packet.c:3108 sock_sendmsg_nosec net/socket.c:727 [inline] __sock_sendmsg+0x21c/0x270 net/socket.c:742 ____sys_sendmsg+0x505/0x830 net/socket.c:2630 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+f3a497f02c389d86ef16@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6920855a.a70a0220.2ea503.0058.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20251121154100.1616228-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/pkt_cls.h | 2 ++ net/sched/em_cmp.c | 5 ++++- net/sched/em_nbyte.c | 2 ++ net/sched/em_text.c | 11 +++++++++-- 4 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index c64fd896b1f9..99ac747b7906 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -536,6 +536,8 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) case TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: + if (!skb_transport_header_was_set(skb)) + break; return skb_transport_header(skb); } diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c index 64b637f18bc7..48c1bce74f49 100644 --- a/net/sched/em_cmp.c +++ b/net/sched/em_cmp.c @@ -22,9 +22,12 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data; - unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off; + unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer); u32 val = 0; + if (!ptr) + return 0; + ptr += cmp->off; if (!tcf_valid_offset(skb, ptr, cmp->align)) return 0; diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index 4f9f21a05d5e..c65ffa5fff94 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -42,6 +42,8 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em, struct nbyte_data *nbyte = (struct nbyte_data *) em->data; unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer); + if (!ptr) + return 0; ptr += nbyte->hdr.off; if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len)) diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 6b3d0af72c39..692e2be1793e 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -29,12 +29,19 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m, struct tcf_pkt_info *info) { struct text_match *tm = EM_TEXT_PRIV(m); + unsigned char *ptr; int from, to; - from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data; + ptr = tcf_get_base_ptr(skb, tm->from_layer); + if (!ptr) + return 0; + from = ptr - skb->data; from += tm->from_offset; - to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data; + ptr = tcf_get_base_ptr(skb, tm->to_layer); + if (!ptr) + return 0; + to = ptr - skb->data; to += tm->to_offset; return skb_find_text(skb, from, to, tm->config) != UINT_MAX; -- cgit v1.2.3