From faeefc173be40512341b102cf1568aa0b6571acd Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Thu, 10 Apr 2025 09:01:27 +0800 Subject: sock: Correct error checking condition for (assign|release)_proto_idx() (assign|release)_proto_idx() wrongly check find_first_zero_bit() failure by condition '(prot->inuse_idx == PROTO_INUSE_NR - 1)' obviously. Fix by correcting the condition to '(prot->inuse_idx == PROTO_INUSE_NR)' Signed-off-by: Zijun Hu Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250410-fix_net-v2-1-d69e7c5739a4@quicinc.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index e54449c9ab0b..121f64011288 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4004,7 +4004,7 @@ static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); - if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { + if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } @@ -4015,7 +4015,7 @@ static int assign_proto_idx(struct proto *prot) static void release_proto_idx(struct proto *prot) { - if (prot->inuse_idx != PROTO_INUSE_NR - 1) + if (prot->inuse_idx != PROTO_INUSE_NR) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else -- cgit v1.2.3 From 22d6c9eebf2e68e6ab831ded37daaa83daff6bb8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 9 Apr 2025 19:36:46 -0700 Subject: net: Unexport shared functions for DCCP. DCCP was removed, so many inet functions no longer need to be exported. Let's unexport or use EXPORT_IPV6_MOD() for such functions. sk_free_unlock_clone() is inlined in sk_clone_lock() as it's the only caller. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250410023921.11307-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 - net/core/sock.c | 32 ++++++++++++++------------------ net/ipv4/inet_connection_sock.c | 12 +----------- net/ipv4/inet_hashtables.c | 16 +++++----------- net/ipv4/inet_timewait_sock.c | 4 ---- net/ipv6/af_inet6.c | 1 - net/ipv6/inet6_connection_sock.c | 2 -- 7 files changed, 20 insertions(+), 48 deletions(-) (limited to 'net/core/sock.c') diff --git a/include/net/sock.h b/include/net/sock.h index 694f954258d4..bb4d6189292f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1781,7 +1781,6 @@ void sk_free(struct sock *sk); void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); -void sk_free_unlock_clone(struct sock *sk); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); diff --git a/net/core/sock.c b/net/core/sock.c index 121f64011288..b64df2463300 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2494,17 +2494,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; + + goto free; } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); - if (bpf_sk_storage_clone(sk, newsk)) { - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; - } + if (bpf_sk_storage_clone(sk, newsk)) + goto free; /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. @@ -2534,18 +2531,17 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) net_enable_timestamp(); out: return newsk; -} -EXPORT_SYMBOL_GPL(sk_clone_lock); - -void sk_free_unlock_clone(struct sock *sk) -{ +free: /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - sk->sk_destruct = NULL; - bh_unlock_sock(sk); - sk_free(sk); + * destructor and make plain sk_free() + */ + newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); + sk_free(newsk); + newsk = NULL; + goto out; } -EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +EXPORT_SYMBOL_GPL(sk_clone_lock); static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 9a20b84dc022..a195203e7eef 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -767,7 +767,6 @@ void inet_csk_init_xmit_timers(struct sock *sk, timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } -EXPORT_SYMBOL(inet_csk_init_xmit_timers); void inet_csk_clear_xmit_timers(struct sock *sk) { @@ -780,7 +779,6 @@ void inet_csk_clear_xmit_timers(struct sock *sk) sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } -EXPORT_SYMBOL(inet_csk_clear_xmit_timers); void inet_csk_clear_xmit_timers_sync(struct sock *sk) { @@ -831,7 +829,6 @@ no_route: __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } -EXPORT_SYMBOL_GPL(inet_csk_route_req); struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, @@ -898,7 +895,6 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) req->num_retrans++; return err; } -EXPORT_SYMBOL(inet_rtx_syn_ack); static struct request_sock * reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, @@ -1058,14 +1054,13 @@ bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { return __inet_csk_reqsk_queue_drop(sk, req, false); } -EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_put(req); } -EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); +EXPORT_IPV6_MOD(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { @@ -1209,7 +1204,6 @@ bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, inet_csk_reqsk_queue_added(sk); return true; } -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, const gfp_t priority) @@ -1290,7 +1284,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, return newsk; } -EXPORT_SYMBOL_GPL(inet_csk_clone_lock); /* * At this point, there should be no process reference to this @@ -1380,7 +1373,6 @@ int inet_csk_listen_start(struct sock *sk) inet_sk_set_state(sk, TCP_CLOSE); return err; } -EXPORT_SYMBOL_GPL(inet_csk_listen_start); static void inet_child_forget(struct sock *sk, struct request_sock *req, struct sock *child) @@ -1475,7 +1467,6 @@ child_put: sock_put(child); return NULL; } -EXPORT_SYMBOL(inet_csk_complete_hashdance); /* * This routine closes sockets which have been at least partially @@ -1590,4 +1581,3 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) out: return dst; } -EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 5bf163f756e9..d1960701a3b4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -713,7 +713,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) } return ok; } -EXPORT_SYMBOL_GPL(inet_ehash_nolisten); +EXPORT_IPV6_MOD(inet_ehash_nolisten); static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) @@ -771,7 +771,7 @@ unlock: return err; } -EXPORT_SYMBOL(__inet_hash); +EXPORT_IPV6_MOD(__inet_hash); int inet_hash(struct sock *sk) { @@ -782,7 +782,6 @@ int inet_hash(struct sock *sk) return err; } -EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { @@ -823,7 +822,7 @@ void inet_unhash(struct sock *sk) spin_unlock_bh(lock); } } -EXPORT_SYMBOL_GPL(inet_unhash); +EXPORT_IPV6_MOD(inet_unhash); static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, @@ -982,14 +981,14 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { return __inet_bhash2_update_saddr(sk, saddr, family, false); } -EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); +EXPORT_IPV6_MOD(inet_bhash2_update_saddr); void inet_bhash2_reset_saddr(struct sock *sk) { if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) __inet_bhash2_update_saddr(sk, NULL, 0, true); } -EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); +EXPORT_IPV6_MOD(inet_bhash2_reset_saddr); /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') @@ -1214,7 +1213,6 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet_check_established); } -EXPORT_SYMBOL_GPL(inet_hash_connect); static void init_hashinfo_lhash2(struct inet_hashinfo *h) { @@ -1265,7 +1263,6 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h) init_hashinfo_lhash2(h); return 0; } -EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { @@ -1305,7 +1302,6 @@ set_mask: hashinfo->ehash_locks_mask = nblocks - 1; return 0; } -EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries) @@ -1341,7 +1337,6 @@ free_hashinfo: err: return NULL; } -EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) { @@ -1352,4 +1347,3 @@ void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) vfree(hashinfo->ehash); kfree(hashinfo); } -EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index aded4bf1bc16..67efe9501581 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -166,7 +166,6 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, spin_unlock(lock); local_bh_enable(); } -EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule); static void tw_timer_handler(struct timer_list *t) { @@ -223,7 +222,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, return tw; } -EXPORT_SYMBOL_GPL(inet_twsk_alloc); /* These are always called from BH context. See callers in * tcp_input.c to verify this. @@ -306,7 +304,6 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } -EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ void inet_twsk_purge(struct inet_hashinfo *hashinfo) @@ -365,4 +362,3 @@ restart: rcu_read_unlock(); } } -EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f60ec8b0f8ea..85bf681d427b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -881,7 +881,6 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, } return false; } -EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index dbcf556a35bb..8f500eaf33cf 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -54,7 +54,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, return dst; } -EXPORT_SYMBOL(inet6_csk_route_req); static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) @@ -137,4 +136,3 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) dst = inet6_csk_route_socket(sk, &fl6); return IS_ERR(dst) ? NULL : dst; } -EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu); -- cgit v1.2.3 From bd61848900bff597764238f3a8ec67c815cd316e Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 8 May 2025 00:48:24 +0000 Subject: net: devmem: Implement TX path Augment dmabuf binding to be able to handle TX. Additional to all the RX binding, we also create tx_vec needed for the TX path. Provide API for sendmsg to be able to send dmabufs bound to this device: - Provide a new dmabuf_tx_cmsg which includes the dmabuf to send from. - MSG_ZEROCOPY with SCM_DEVMEM_DMABUF cmsg indicates send from dma-buf. Devmem is uncopyable, so piggyback off the existing MSG_ZEROCOPY implementation, while disabling instances where MSG_ZEROCOPY falls back to copying. We additionally pipe the binding down to the new zerocopy_fill_skb_from_devmem which fills a TX skb with net_iov netmems instead of the traditional page netmems. We also special case skb_frag_dma_map to return the dma-address of these dmabuf net_iovs instead of attempting to map pages. The TX path may release the dmabuf in a context where we cannot wait. This happens when the user unbinds a TX dmabuf while there are still references to its netmems in the TX path. In that case, the netmems will be put_netmem'd from a context where we can't unmap the dmabuf, Resolve this by making __net_devmem_dmabuf_binding_free schedule_work'd. Based on work by Stanislav Fomichev . A lot of the meat of the implementation came from devmem TCP RFC v1[1], which included the TX path, but Stan did all the rebasing on top of netmem/net_iov. Cc: Stanislav Fomichev Signed-off-by: Kaiyuan Zhang Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250508004830.4100853-5-almasrymina@google.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 17 +++-- include/net/sock.h | 1 + io_uring/zcrx.c | 2 +- net/core/datagram.c | 48 ++++++++++++- net/core/devmem.c | 118 +++++++++++++++++++++++++++----- net/core/devmem.h | 61 ++++++++++++++--- net/core/netdev-genl.c | 71 ++++++++++++++++++- net/core/skbuff.c | 18 +++-- net/core/sock.c | 5 ++ net/ipv4/ip_output.c | 3 +- net/ipv4/tcp.c | 48 ++++++++++--- net/ipv6/ip6_output.c | 3 +- net/vmw_vsock/virtio_transport_common.c | 5 +- 13 files changed, 340 insertions(+), 60 deletions(-) (limited to 'net/core/sock.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f3e72be6f634..c7397b17bb08 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1707,13 +1707,16 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops; struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg); + struct ubuf_info *uarg, bool devmem); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); +struct net_devmem_dmabuf_binding; + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length); + size_t length, + struct net_devmem_dmabuf_binding *binding); int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length); @@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { - return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); + return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len, + NULL); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg); + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding); /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) @@ -3697,6 +3702,10 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev, size_t offset, size_t size, enum dma_data_direction dir) { + if (skb_frag_is_net_iov(frag)) { + return netmem_to_net_iov(frag->netmem)->dma_addr + offset + + frag->offset; + } return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } diff --git a/include/net/sock.h b/include/net/sock.h index f0fabb9fd28a..3e15d7105ad2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1851,6 +1851,7 @@ struct sockcm_cookie { u32 tsflags; u32 ts_opt_id; u32 priority; + u32 dmabuf_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index a07ad38935c8..0c64129f5d50 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -810,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, return io_zcrx_copy_frag(req, ifq, frag, off, len); niov = netmem_to_net_iov(frag->netmem); - if (niov->pp->mp_ops != &io_uring_pp_zc_ops || + if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(niov->pp) != ifq) return -EFAULT; diff --git a/net/core/datagram.c b/net/core/datagram.c index f0634f0cb834..9ef5442536f5 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -63,6 +63,8 @@ #include #include +#include "devmem.h" + /* * Is a socket 'connection oriented' ? */ @@ -691,9 +693,49 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, return 0; } +static int +zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from, + int length, + struct net_devmem_dmabuf_binding *binding) +{ + int i = skb_shinfo(skb)->nr_frags; + size_t virt_addr, size, off; + struct net_iov *niov; + + /* Devmem filling works by taking an IOVEC from the user where the + * iov_addrs are interpreted as an offset in bytes into the dma-buf to + * send from. We do not support other iter types. + */ + if (iov_iter_type(from) != ITER_IOVEC) + return -EFAULT; + + while (length && iov_iter_count(from)) { + if (i == MAX_SKB_FRAGS) + return -EMSGSIZE; + + virt_addr = (size_t)iter_iov_addr(from); + niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size); + if (!niov) + return -EFAULT; + + size = min_t(size_t, size, length); + size = min_t(size_t, size, iter_iov_len(from)); + + get_netmem(net_iov_to_netmem(niov)); + skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off, + size, PAGE_SIZE); + iov_iter_advance(from, size); + length -= size; + i++; + } + + return 0; +} + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length) + size_t length, + struct net_devmem_dmabuf_binding *binding) { unsigned long orig_size = skb->truesize; unsigned long truesize; @@ -701,6 +743,8 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, if (msg && msg->msg_ubuf && msg->sg_from_iter) ret = msg->sg_from_iter(skb, from, length); + else if (binding) + ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding); else ret = zerocopy_fill_skb_from_iter(skb, from, length); @@ -734,7 +778,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; - return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); + return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL); } EXPORT_SYMBOL(zerocopy_sg_from_iter); diff --git a/net/core/devmem.c b/net/core/devmem.c index dca2ff7cf692..88065e5ef651 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "devmem.h" @@ -52,8 +53,10 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { + struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w); + size_t size, avail; gen_pool_for_each_chunk(binding->chunk_pool, @@ -71,8 +74,10 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) dma_buf_detach(binding->dmabuf, binding->attachment); dma_buf_put(binding->dmabuf); xa_destroy(&binding->bound_rxqs); + kvfree(binding->tx_vec); kfree(binding); } +EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free); struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) @@ -117,6 +122,13 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) unsigned long xa_idx; unsigned int rxq_idx; + xa_erase(&net_devmem_dmabuf_bindings, binding->id); + + /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the + * erase. + */ + synchronize_net(); + if (binding->list.next) list_del(&binding->list); @@ -131,8 +143,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); } - xa_erase(&net_devmem_dmabuf_bindings, binding->id); - net_devmem_dmabuf_binding_put(binding); } @@ -166,8 +176,9 @@ err_close_rxq: } struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, - struct netlink_ext_ack *extack) +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netlink_ext_ack *extack) { struct net_devmem_dmabuf_binding *binding; static u32 id_alloc_next; @@ -189,13 +200,6 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, } binding->dev = dev; - - err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, - binding, xa_limit_32b, &id_alloc_next, - GFP_KERNEL); - if (err < 0) - goto err_free_binding; - xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); refcount_set(&binding->ref, 1); @@ -206,26 +210,36 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, if (IS_ERR(binding->attachment)) { err = PTR_ERR(binding->attachment); NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); - goto err_free_id; + goto err_free_binding; } binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, - DMA_FROM_DEVICE); + direction); if (IS_ERR(binding->sgt)) { err = PTR_ERR(binding->sgt); NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); goto err_detach; } + if (direction == DMA_TO_DEVICE) { + binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE, + sizeof(struct net_iov *), + GFP_KERNEL); + if (!binding->tx_vec) { + err = -ENOMEM; + goto err_unmap; + } + } + /* For simplicity we expect to make PAGE_SIZE allocations, but the * binding can be much more flexible than that. We may be able to * allocate MTU sized chunks here. Leave that for future work... */ - binding->chunk_pool = - gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev)); + binding->chunk_pool = gen_pool_create(PAGE_SHIFT, + dev_to_node(&dev->dev)); if (!binding->chunk_pool) { err = -ENOMEM; - goto err_unmap; + goto err_tx_vec; } virtual = 0; @@ -270,24 +284,32 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, niov->owner = &owner->area; page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); + if (direction == DMA_TO_DEVICE) + binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov; } virtual += len; } + err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, + binding, xa_limit_32b, &id_alloc_next, + GFP_KERNEL); + if (err < 0) + goto err_free_chunks; + return binding; err_free_chunks: gen_pool_for_each_chunk(binding->chunk_pool, net_devmem_dmabuf_free_chunk_owner, NULL); gen_pool_destroy(binding->chunk_pool); +err_tx_vec: + kvfree(binding->tx_vec); err_unmap: dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, DMA_FROM_DEVICE); err_detach: dma_buf_detach(dmabuf, binding->attachment); -err_free_id: - xa_erase(&net_devmem_dmabuf_bindings, binding->id); err_free_binding: kfree(binding); err_put_dmabuf: @@ -295,6 +317,21 @@ err_put_dmabuf: return ERR_PTR(err); } +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + struct net_devmem_dmabuf_binding *binding; + + rcu_read_lock(); + binding = xa_load(&net_devmem_dmabuf_bindings, id); + if (binding) { + if (!net_devmem_dmabuf_binding_get(binding)) + binding = NULL; + } + rcu_read_unlock(); + + return binding; +} + void net_devmem_get_net_iov(struct net_iov *niov) { net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov)); @@ -305,6 +342,49 @@ void net_devmem_put_net_iov(struct net_iov *niov) net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov)); } +struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, + unsigned int dmabuf_id) +{ + struct net_devmem_dmabuf_binding *binding; + struct dst_entry *dst = __sk_dst_get(sk); + int err = 0; + + binding = net_devmem_lookup_dmabuf(dmabuf_id); + if (!binding || !binding->tx_vec) { + err = -EINVAL; + goto out_err; + } + + /* The dma-addrs in this binding are only reachable to the corresponding + * net_device. + */ + if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) { + err = -ENODEV; + goto out_err; + } + + return binding; + +out_err: + if (binding) + net_devmem_dmabuf_binding_put(binding); + + return ERR_PTR(err); +} + +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, + size_t virt_addr, size_t *off, size_t *size) +{ + if (virt_addr >= binding->dmabuf->size) + return NULL; + + *off = virt_addr % PAGE_SIZE; + *size = PAGE_SIZE - *off; + + return binding->tx_vec[virt_addr / PAGE_SIZE]; +} + /*** "Dmabuf devmem memory provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) diff --git a/net/core/devmem.h b/net/core/devmem.h index 946f2e015746..67168aae5e5b 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -23,8 +23,9 @@ struct net_devmem_dmabuf_binding { /* The user holds a ref (via the netlink API) for as long as they want * the binding to remain alive. Each page pool using this binding holds - * a ref to keep the binding alive. Each allocated net_iov holds a - * ref. + * a ref to keep the binding alive. The page_pool does not release the + * ref until all the net_iovs allocated from this binding are released + * back to the page_pool. * * The binding undos itself and unmaps the underlying dmabuf once all * those refs are dropped and the binding is no longer desired or in @@ -32,7 +33,10 @@ struct net_devmem_dmabuf_binding { * * net_devmem_get_net_iov() on dmabuf net_iovs will increment this * reference, making sure that the binding remains alive until all the - * net_iovs are no longer used. + * net_iovs are no longer used. net_iovs allocated from this binding + * that are stuck in the TX path for any reason (such as awaiting + * retransmits) hold a reference to the binding until the skb holding + * them is freed. */ refcount_t ref; @@ -48,6 +52,14 @@ struct net_devmem_dmabuf_binding { * active. */ u32 id; + + /* Array of net_iov pointers for this binding, sorted by virtual + * address. This array is convenient to map the virtual addresses to + * net_iovs in the TX path. + */ + struct net_iov **tx_vec; + + struct work_struct unbind_w; }; #if defined(CONFIG_NET_DEVMEM) @@ -64,14 +76,17 @@ struct dmabuf_genpool_chunk_owner { dma_addr_t base_dma_addr; }; -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); +void __net_devmem_dmabuf_binding_free(struct work_struct *wq); struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, - struct netlink_ext_ack *extack); +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netlink_ext_ack *extack); +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id); void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); +void net_devmem_bind_tx_release(struct sock *sk); static inline struct dmabuf_genpool_chunk_owner * net_devmem_iov_to_chunk_owner(const struct net_iov *niov) @@ -100,10 +115,10 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); } -static inline void +static inline bool net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) { - refcount_inc(&binding->ref); + return refcount_inc_not_zero(&binding->ref); } static inline void @@ -112,7 +127,8 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) if (!refcount_dec_and_test(&binding->ref)) return; - __net_devmem_dmabuf_binding_free(binding); + INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); + schedule_work(&binding->unbind_w); } void net_devmem_get_net_iov(struct net_iov *niov); @@ -123,6 +139,11 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); bool net_is_devmem_iov(struct net_iov *niov); +struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id); +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size); #else struct net_devmem_dmabuf_binding; @@ -140,18 +161,23 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov) { } -static inline void -__net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +static inline void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { } static inline struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, + enum dma_data_direction direction, struct netlink_ext_ack *extack) { return ERR_PTR(-EOPNOTSUPP); } +static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + return NULL; +} + static inline void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) { @@ -190,6 +216,19 @@ static inline bool net_is_devmem_iov(struct net_iov *niov) { return false; } + +static inline struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size) +{ + return NULL; +} #endif #endif /* _NET_DEVMEM_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 3beef87235d8..93ea57d96931 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -907,7 +907,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock; } - binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack); + binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd, + info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock; @@ -968,10 +969,74 @@ err_genlmsg_free: return err; } -/* stub */ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) { - return 0; + struct net_devmem_dmabuf_binding *binding; + struct netdev_nl_sock *priv; + struct net_device *netdev; + u32 ifindex, dmabuf_fd; + struct sk_buff *rsp; + int err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + mutex_lock(&priv->lock); + + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_unlock_sock; + } + + if (!netif_device_present(netdev)) { + err = -ENODEV; + goto err_unlock_netdev; + } + + binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, + info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_unlock_netdev; + } + + list_add(&binding->list, &priv->bindings); + + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + netdev_unlock(netdev); + mutex_unlock(&priv->lock); + + return genlmsg_reply(rsp, info); + +err_unlock_netdev: + netdev_unlock(netdev); +err_unlock_sock: + mutex_unlock(&priv->lock); +err_genlmsg_free: + nlmsg_free(rsp); + return err; } void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 00c22bce98e4..4159107f1666 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1655,7 +1655,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size, + bool devmem) { struct ubuf_info_msgzc *uarg; struct sk_buff *skb; @@ -1670,7 +1671,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) uarg = (void *)skb->cb; uarg->mmp.user = NULL; - if (mm_account_pinned_pages(&uarg->mmp, size)) { + if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) { kfree_skb(skb); return NULL; } @@ -1693,7 +1694,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) } struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg) + struct ubuf_info *uarg, bool devmem) { if (uarg) { struct ubuf_info_msgzc *uarg_zc; @@ -1723,7 +1724,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg_zc->id + uarg_zc->len) == next) { - if (mm_account_pinned_pages(&uarg_zc->mmp, size)) + if (likely(!devmem) && + mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; uarg_zc->len++; uarg_zc->bytelen = bytelen; @@ -1738,7 +1740,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, } new_alloc: - return msg_zerocopy_alloc(sk, size); + return msg_zerocopy_alloc(sk, size, devmem); } EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); @@ -1842,7 +1844,8 @@ EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg) + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding) { int err, orig_len = skb->len; @@ -1861,7 +1864,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return -EEXIST; } - err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); + err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len, + binding); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; diff --git a/net/core/sock.c b/net/core/sock.c index b64df2463300..347ce75482f5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3018,6 +3018,11 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, return -EPERM; sockc->priority = *(u32 *)CMSG_DATA(cmsg); break; + case SCM_DEVMEM_DMABUF: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); + break; default: return -EINVAL; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6e18d7ec5062..a2705d454fd6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1014,7 +1014,8 @@ static int __ip_append_data(struct sock *sk, uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), + false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 86c427f16636..0ae265d39184 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1059,6 +1059,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { + struct net_devmem_dmabuf_binding *binding = NULL; struct tcp_sock *tp = tcp_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; @@ -1066,11 +1067,23 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; + bool sockc_valid = true; int zc = 0; long timeo; flags = msg->msg_flags; + sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) }; + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) + /* Don't return error until MSG_FASTOPEN has been + * processed; that may succeed even if the cmsg is + * invalid. + */ + sockc_valid = false; + } + if ((flags & MSG_ZEROCOPY) && size) { if (msg->msg_ubuf) { uarg = msg->msg_ubuf; @@ -1078,7 +1091,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_ZEROCOPY; } else if (sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); - uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb), + sockc_valid && !!sockc.dmabuf_id); if (!uarg) { err = -ENOBUFS; goto out_err; @@ -1087,12 +1101,27 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_ZEROCOPY; else uarg_to_msgzc(uarg)->zerocopy = 0; + + if (sockc_valid && sockc.dmabuf_id) { + binding = net_devmem_get_binding(sk, sockc.dmabuf_id); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + binding = NULL; + goto out_err; + } + } } } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_SPLICE_PAGES; } + if (sockc_valid && sockc.dmabuf_id && + (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) { + err = -EINVAL; + goto out_err; + } + if (unlikely(flags & MSG_FASTOPEN || inet_test_bit(DEFER_CONNECT, sk)) && !tp->repair) { @@ -1131,13 +1160,10 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags)}; - if (msg->msg_controllen) { - err = sock_cmsg_send(sk, msg, &sockc); - if (unlikely(err)) { + if (!sockc_valid) { + if (!err) err = -EINVAL; - goto out_err; - } + goto out_err; } /* This should be in poll */ @@ -1258,7 +1284,8 @@ new_segment: goto wait_for_space; } - err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); + err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg, + binding); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); goto new_segment; @@ -1339,6 +1366,8 @@ out_nopush: /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put(uarg); + if (binding) + net_devmem_dmabuf_binding_put(binding); return copied + copied_syn; do_error: @@ -1356,6 +1385,9 @@ out_err: sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } + if (binding) + net_devmem_dmabuf_binding_put(binding); + return err; } EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ef052ccd9380..7bd29a9ff0db 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1524,7 +1524,8 @@ emsgsize: uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), + false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 7f7de6d88096..6e7b727c781c 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -87,7 +87,7 @@ static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk, uarg = msg_zerocopy_realloc(sk_vsock(vsk), iter->count, - NULL); + NULL, false); if (!uarg) return -1; @@ -107,8 +107,7 @@ static int virtio_transport_fill_skb(struct sk_buff *skb, { if (zcopy) return __zerocopy_sg_from_iter(info->msg, NULL, skb, - &info->msg->msg_iter, - len); + &info->msg->msg_iter, len, NULL); return memcpy_from_msg(skb_put(skb, len), info->msg, len); } -- cgit v1.2.3 From ae4f2f59e1f9c7c9cab1641a3c9645e587f0bc72 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:55 -0700 Subject: tcp: Restrict SO_TXREHASH to TCP socket. sk->sk_txrehash is only used for TCP. Let's restrict SO_TXREHASH to TCP to reflect this. Later, we will make sk_txrehash a part of the union for other protocol families. Note that we need to modify BPF selftest not to get/set SO_TEREHASH for non-TCP sockets. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/sock.c | 5 +++++ tools/testing/selftests/bpf/progs/setget_sockopt.c | 11 +++++++++++ 2 files changed, 16 insertions(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 347ce75482f5..d7d6d3a8efe5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1276,6 +1276,8 @@ int sk_setsockopt(struct sock *sk, int level, int optname, return 0; } case SO_TXREHASH: + if (!sk_is_tcp(sk)) + return -EOPNOTSUPP; if (val < -1 || val > 1) return -EINVAL; if ((u8)val == SOCK_TXREHASH_DEFAULT) @@ -2102,6 +2104,9 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_TXREHASH: + if (!sk_is_tcp(sk)) + return -EOPNOTSUPP; + /* Paired with WRITE_ONCE() in sk_setsockopt() */ v.val = READ_ONCE(sk->sk_txrehash); break; diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c index 0107a24b7522..d330b1511979 100644 --- a/tools/testing/selftests/bpf/progs/setget_sockopt.c +++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c @@ -83,6 +83,14 @@ struct loop_ctx { struct sock *sk; }; +static bool sk_is_tcp(struct sock *sk) +{ + return (sk->__sk_common.skc_family == AF_INET || + sk->__sk_common.skc_family == AF_INET6) && + sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + static int bpf_test_sockopt_flip(void *ctx, struct sock *sk, const struct sockopt_test *t, int level) @@ -91,6 +99,9 @@ static int bpf_test_sockopt_flip(void *ctx, struct sock *sk, opt = t->opt; + if (opt == SO_TXREHASH && !sk_is_tcp(sk)) + return 0; + if (bpf_getsockopt(ctx, level, opt, &old, sizeof(old))) return 1; /* kernel initialized txrehash to 255 */ -- cgit v1.2.3 From 7d8d93fdde50b86bbbf46a203c368ed320e729ab Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:56 -0700 Subject: net: Restrict SO_PASS{CRED,PIDFD,SEC} to AF_{UNIX,NETLINK,BLUETOOTH}. SCM_CREDENTIALS and SCM_SECURITY can be recv()ed by calling scm_recv() or scm_recv_unix(), and SCM_PIDFD is only used by scm_recv_unix(). scm_recv() is called from AF_NETLINK and AF_BLUETOOTH. scm_recv_unix() is literally called from AF_UNIX. Let's restrict SO_PASSCRED and SO_PASSSEC to such sockets and SO_PASSPIDFD to AF_UNIX only. Later, SOCK_PASS{CRED,PIDFD,SEC} will be moved to struct sock and united with another field. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 14 +++++++++++++- net/core/sock.c | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'net/core/sock.c') diff --git a/include/net/sock.h b/include/net/sock.h index 35ca6b13c6d2..483522377955 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2773,9 +2773,14 @@ static inline bool sk_is_udp(const struct sock *sk) sk->sk_protocol == IPPROTO_UDP; } +static inline bool sk_is_unix(const struct sock *sk) +{ + return sk->sk_family == AF_UNIX; +} + static inline bool sk_is_stream_unix(const struct sock *sk) { - return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM; + return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM; } static inline bool sk_is_vsock(const struct sock *sk) @@ -2783,6 +2788,13 @@ static inline bool sk_is_vsock(const struct sock *sk) return sk->sk_family == AF_VSOCK; } +static inline bool sk_may_scm_recv(const struct sock *sk) +{ + return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) || + sk->sk_family == AF_NETLINK || + (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH); +} + /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from diff --git a/net/core/sock.c b/net/core/sock.c index d7d6d3a8efe5..fd5f9d3873c1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1221,12 +1221,21 @@ int sk_setsockopt(struct sock *sk, int level, int optname, } return -EPERM; case SO_PASSSEC: + if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + assign_bit(SOCK_PASSSEC, &sock->flags, valbool); return 0; case SO_PASSCRED: + if (!sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + assign_bit(SOCK_PASSCRED, &sock->flags, valbool); return 0; case SO_PASSPIDFD: + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); return 0; case SO_TYPE: @@ -1855,10 +1864,16 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PASSCRED: + if (!sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); break; case SO_PASSPIDFD: + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); break; @@ -1956,6 +1971,9 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PASSSEC: + if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); break; -- cgit v1.2.3 From 0e81cfd971dc4833c699dcd8924e54a5021bc4e8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:57 -0700 Subject: af_unix: Move SOCK_PASS{CRED,PIDFD,SEC} to struct sock. As explained in the next patch, SO_PASSRIGHTS would have a problem if we assigned a corresponding bit to socket->flags, so it must be managed in struct sock. Mixing socket->flags and sk->sk_flags for similar options will look confusing, and sk->sk_flags does not have enough space on 32bit system. Also, as mentioned in commit 16e572626961 ("af_unix: dont send SCM_CREDENTIALS by default"), SOCK_PASSCRED and SOCK_PASSPID handling is known to be slow, and managing the flags in struct socket cannot avoid that for embryo sockets. Let's move SOCK_PASS{CRED,PIDFD,SEC} to struct sock. While at it, other SOCK_XXX flags in net.h are grouped as enum. Note that assign_bit() was atomic, so the writer side is moved down after lock_sock() in setsockopt(), but the bit is only read once in sendmsg() and recvmsg(), so lock_sock() is not needed there. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/net.h | 15 +++++++-------- include/net/sock.h | 16 +++++++++++++++- net/core/scm.c | 29 ++++++++++++++--------------- net/core/sock.c | 44 +++++++++++++++++++++++--------------------- net/unix/af_unix.c | 18 ++---------------- 5 files changed, 61 insertions(+), 61 deletions(-) (limited to 'net/core/sock.c') diff --git a/include/linux/net.h b/include/linux/net.h index 0ff950eecc6b..f8418d6e33e0 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -36,14 +36,13 @@ struct net; * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected. * Eventually all flags will be in sk->sk_wq->flags. */ -#define SOCKWQ_ASYNC_NOSPACE 0 -#define SOCKWQ_ASYNC_WAITDATA 1 -#define SOCK_NOSPACE 2 -#define SOCK_PASSCRED 3 -#define SOCK_PASSSEC 4 -#define SOCK_SUPPORT_ZC 5 -#define SOCK_CUSTOM_SOCKOPT 6 -#define SOCK_PASSPIDFD 7 +enum socket_flags { + SOCKWQ_ASYNC_NOSPACE, + SOCKWQ_ASYNC_WAITDATA, + SOCK_NOSPACE, + SOCK_SUPPORT_ZC, + SOCK_CUSTOM_SOCKOPT, +}; #ifndef ARCH_HAS_SOCKET_TYPES /** diff --git a/include/net/sock.h b/include/net/sock.h index 483522377955..d90a71f66ab8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -337,6 +337,11 @@ struct sk_filter; * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags + * @sk_scm_recv_flags: all flags used by scm_recv() + * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS + * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY + * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD + * @sk_scm_unused: unused flags for scm_recv() * @ns_tracker: tracker for netns reference * @sk_user_frags: xarray of pages the user is holding a reference on. * @sk_owner: reference to the real owner of the socket that calls @@ -523,7 +528,16 @@ struct sock { #endif int sk_disconnects; - u8 sk_txrehash; + union { + u8 sk_txrehash; + u8 sk_scm_recv_flags; + struct { + u8 sk_scm_credentials : 1, + sk_scm_security : 1, + sk_scm_pidfd : 1, + sk_scm_unused : 5; + }; + }; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, diff --git a/net/core/scm.c b/net/core/scm.c index 66e02b18c359..0225bd94170f 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -406,12 +406,12 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) EXPORT_SYMBOL(scm_fp_dup); #ifdef CONFIG_SECURITY_NETWORK -static void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) { struct lsm_context ctx; int err; - if (test_bit(SOCK_PASSSEC, &sock->flags)) { + if (sk->sk_scm_security) { err = security_secid_to_secctx(scm->secid, &ctx); if (err >= 0) { @@ -423,16 +423,16 @@ static void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cooki } } -static bool scm_has_secdata(struct socket *sock) +static bool scm_has_secdata(struct sock *sk) { - return test_bit(SOCK_PASSSEC, &sock->flags); + return sk->sk_scm_security; } #else -static void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) { } -static bool scm_has_secdata(struct socket *sock) +static bool scm_has_secdata(struct sock *sk) { return false; } @@ -474,20 +474,19 @@ static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) fd_install(pidfd, pidfd_file); } -static bool __scm_recv_common(struct socket *sock, struct msghdr *msg, +static bool __scm_recv_common(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm, int flags) { if (!msg->msg_control) { - if (test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags) || - scm->fp || scm_has_secdata(sock)) + if (sk->sk_scm_credentials || sk->sk_scm_pidfd || + scm->fp || scm_has_secdata(sk)) msg->msg_flags |= MSG_CTRUNC; scm_destroy(scm); return false; } - if (test_bit(SOCK_PASSCRED, &sock->flags)) { + if (sk->sk_scm_credentials) { struct user_namespace *current_ns = current_user_ns(); struct ucred ucreds = { .pid = scm->creds.pid, @@ -498,7 +497,7 @@ static bool __scm_recv_common(struct socket *sock, struct msghdr *msg, put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); } - scm_passec(sock, msg, scm); + scm_passec(sk, msg, scm); if (scm->fp) scm_detach_fds(msg, scm); @@ -509,7 +508,7 @@ static bool __scm_recv_common(struct socket *sock, struct msghdr *msg, void scm_recv(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { - if (!__scm_recv_common(sock, msg, scm, flags)) + if (!__scm_recv_common(sock->sk, msg, scm, flags)) return; scm_destroy_cred(scm); @@ -519,10 +518,10 @@ EXPORT_SYMBOL(scm_recv); void scm_recv_unix(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { - if (!__scm_recv_common(sock, msg, scm, flags)) + if (!__scm_recv_common(sock->sk, msg, scm, flags)) return; - if (test_bit(SOCK_PASSPIDFD, &sock->flags)) + if (sock->sk->sk_scm_pidfd) scm_pidfd_recv(msg, scm); scm_destroy_cred(scm); diff --git a/net/core/sock.c b/net/core/sock.c index fd5f9d3873c1..381abf8f25b7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1220,24 +1220,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname, return 0; } return -EPERM; - case SO_PASSSEC: - if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || sk_may_scm_recv(sk)) - return -EOPNOTSUPP; - - assign_bit(SOCK_PASSSEC, &sock->flags, valbool); - return 0; - case SO_PASSCRED: - if (!sk_may_scm_recv(sk)) - return -EOPNOTSUPP; - - assign_bit(SOCK_PASSCRED, &sock->flags, valbool); - return 0; - case SO_PASSPIDFD: - if (!sk_is_unix(sk)) - return -EOPNOTSUPP; - - assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); - return 0; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: @@ -1568,6 +1550,26 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; + case SO_PASSCRED: + if (sk_may_scm_recv(sk)) + sk->sk_scm_credentials = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSSEC: + if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) + sk->sk_scm_security = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSPIDFD: + if (sk_is_unix(sk)) + sk->sk_scm_pidfd = valbool; + else + ret = -EOPNOTSUPP; + break; case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); @@ -1867,14 +1869,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, if (!sk_may_scm_recv(sk)) return -EOPNOTSUPP; - v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); + v.val = sk->sk_scm_credentials; break; case SO_PASSPIDFD: if (!sk_is_unix(sk)) return -EOPNOTSUPP; - v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); + v.val = sk->sk_scm_pidfd; break; case SO_PEERCRED: @@ -1974,7 +1976,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) return -EOPNOTSUPP; - v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); + v.val = sk->sk_scm_security; break; case SO_PEERSEC: diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index a39497fd6e98..27ebda4cd9b9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -767,10 +767,7 @@ static void copy_peercred(struct sock *sk, struct sock *peersk) static bool unix_may_passcred(const struct sock *sk) { - struct socket *sock = sk->sk_socket; - - return test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags); + return sk->sk_scm_credentials || sk->sk_scm_pidfd; } static int unix_listen(struct socket *sock, int backlog) @@ -1713,17 +1710,6 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) return 0; } -static void unix_sock_inherit_flags(const struct socket *old, - struct socket *new) -{ - if (test_bit(SOCK_PASSCRED, &old->flags)) - set_bit(SOCK_PASSCRED, &new->flags); - if (test_bit(SOCK_PASSPIDFD, &old->flags)) - set_bit(SOCK_PASSPIDFD, &new->flags); - if (test_bit(SOCK_PASSSEC, &old->flags)) - set_bit(SOCK_PASSSEC, &new->flags); -} - static int unix_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { @@ -1760,7 +1746,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, unix_state_lock(tsk); unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; - unix_sock_inherit_flags(sock, newsock); + tsk->sk_scm_recv_flags = READ_ONCE(sk->sk_scm_recv_flags); sock_graft(tsk, newsock); unix_state_unlock(tsk); return 0; -- cgit v1.2.3 From 77cbe1a6d8730a07f99f9263c2d5f2304cf5e830 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:59 -0700 Subject: af_unix: Introduce SO_PASSRIGHTS. As long as recvmsg() or recvmmsg() is used with cmsg, it is not possible to avoid receiving file descriptors via SCM_RIGHTS. This behaviour has occasionally been flagged as problematic, as it can be (ab)used to trigger DoS during close(), for example, by passing a FUSE-controlled fd or a hung NFS fd. For instance, as noted on the uAPI Group page [0], an untrusted peer could send a file descriptor pointing to a hung NFS mount and then close it. Once the receiver calls recvmsg() with msg_control, the descriptor is automatically installed, and then the responsibility for the final close() now falls on the receiver, which may result in blocking the process for a long time. Regarding this, systemd calls cmsg_close_all() [1] after each recvmsg() to close() unwanted file descriptors sent via SCM_RIGHTS. However, this cannot work around the issue at all, because the final fput() may still occur on the receiver's side once sendmsg() with SCM_RIGHTS succeeds. Also, even filtering by LSM at recvmsg() does not work for the same reason. Thus, we need a better way to refuse SCM_RIGHTS at sendmsg(). Let's introduce SO_PASSRIGHTS to disable SCM_RIGHTS. Note that this option is enabled by default for backward compatibility. Link: https://uapi-group.org/kernel-features/#disabling-reception-of-scm_rights-for-af_unix-sockets #[0] Link: https://github.com/systemd/systemd/blob/v257.5/src/basic/fd-util.c#L612-L628 #[1] Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ include/net/sock.h | 4 +++- include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 14 ++++++++++++++ net/unix/af_unix.c | 22 ++++++++++++++++++++-- tools/include/uapi/asm-generic/socket.h | 2 ++ 9 files changed, 49 insertions(+), 3 deletions(-) (limited to 'net/core/sock.c') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 3df5f2dd4c0f..8f1f18adcdb5 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -150,6 +150,8 @@ #define SO_RCVPRIORITY 82 +#define SO_PASSRIGHTS 83 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 22fa8f19924a..31ac655b7837 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -161,6 +161,8 @@ #define SO_RCVPRIORITY 82 +#define SO_PASSRIGHTS 83 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 96831c988606..1f2d5b7a7f5d 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -142,6 +142,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 0x4050 +#define SO_PASSRIGHTS 0x4051 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 5b464a568664..adcba7329386 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -143,6 +143,8 @@ #define SO_RCVPRIORITY 0x005b +#define SO_PASSRIGHTS 0x005c + #if !defined(__KERNEL__) diff --git a/include/net/sock.h b/include/net/sock.h index d90a71f66ab8..92e7c1aae3cc 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -341,6 +341,7 @@ struct sk_filter; * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD + * @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS * @sk_scm_unused: unused flags for scm_recv() * @ns_tracker: tracker for netns reference * @sk_user_frags: xarray of pages the user is holding a reference on. @@ -535,7 +536,8 @@ struct sock { u8 sk_scm_credentials : 1, sk_scm_security : 1, sk_scm_pidfd : 1, - sk_scm_unused : 5; + sk_scm_rights : 1, + sk_scm_unused : 4; }; }; u8 sk_clockid; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index aa5016ff3d91..f333a0ac4ee4 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -145,6 +145,8 @@ #define SO_RCVPRIORITY 82 +#define SO_PASSRIGHTS 83 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/net/core/sock.c b/net/core/sock.c index 381abf8f25b7..0cb52e590094 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1571,6 +1571,13 @@ set_sndbuf: ret = -EOPNOTSUPP; break; + case SO_PASSRIGHTS: + if (sk_is_unix(sk)) + sk->sk_scm_rights = valbool; + else + ret = -EOPNOTSUPP; + break; + case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); break; @@ -1879,6 +1886,13 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = sk->sk_scm_pidfd; break; + case SO_PASSRIGHTS: + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_rights; + break; + case SO_PEERCRED: { struct ucred peercred; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 900bad88fbd2..bd507f74e35e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1015,6 +1015,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sock_init_data(sock, sk); + sk->sk_scm_rights = 1; sk->sk_hash = unix_unbound_hash(sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; @@ -2073,6 +2074,11 @@ restart_locked: goto out_unlock; } + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + err = -EPERM; + goto out_unlock; + } + if (sk->sk_type != SOCK_SEQPACKET) { err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) @@ -2174,9 +2180,13 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) { - unix_state_unlock(other); err = -EPIPE; - goto out; + goto out_unlock; + } + + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + err = -EPERM; + goto out_unlock; } unix_maybe_add_creds(skb, sk, other); @@ -2192,6 +2202,8 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, other->sk_data_ready(other); return 0; +out_unlock: + unix_state_unlock(other); out: consume_skb(skb); return err; @@ -2295,6 +2307,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, (other->sk_shutdown & RCV_SHUTDOWN)) goto out_pipe_unlock; + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + unix_state_unlock(other); + err = -EPERM; + goto out_free; + } + unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); diff --git a/tools/include/uapi/asm-generic/socket.h b/tools/include/uapi/asm-generic/socket.h index aa5016ff3d91..f333a0ac4ee4 100644 --- a/tools/include/uapi/asm-generic/socket.h +++ b/tools/include/uapi/asm-generic/socket.h @@ -145,6 +145,8 @@ #define SO_RCVPRIORITY 82 +#define SO_PASSRIGHTS 83 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) -- cgit v1.2.3