From a948f713842ad5c23f125efc61dee6951893219c Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Tue, 3 Jul 2018 15:05:48 -0500 Subject: nl80211/mac80211: allow non-linear skb in rx_control_port The current implementation of cfg80211_rx_control_port assumed that the caller could provide a contiguous region of memory for the control port frame to be sent up to userspace. Unfortunately, many drivers produce non-linear skbs, especially for data frames. This resulted in userspace getting notified of control port frames with correct metadata (from address, port, etc) yet garbage / nonsense contents, resulting in bad handshakes, disconnections, etc. mac80211 linearizes skbs containing management frames. But it didn't seem worthwhile to do this for control port frames. Thus the signature of cfg80211_rx_control_port was changed to take the skb directly. nl80211 then takes care of obtaining control port frame data directly from the (linear | non-linear) skb. The caller is still responsible for freeing the skb, cfg80211_rx_control_port does not take ownership of it. Fixes: 6a671a50f819 ("nl80211: Add CMD_CONTROL_PORT_FRAME API") Signed-off-by: Denis Kenzior [fix some kernel-doc formatting, add fixes tag] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5fbfe61f41c6..1beb3ead0385 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5835,10 +5835,11 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, /** * cfg80211_rx_control_port - notification about a received control port frame * @dev: The device the frame matched to - * @buf: control port frame - * @len: length of the frame data - * @addr: The peer from which the frame was received - * @proto: frame protocol, typically PAE or Pre-authentication + * @skb: The skbuf with the control port frame. It is assumed that the skbuf + * is 802.3 formatted (with 802.3 header). The skb can be non-linear. + * This function does not take ownership of the skb, so the caller is + * responsible for any cleanup. The caller must also ensure that + * skb->protocol is set appropriately. * @unencrypted: Whether the frame was received unencrypted * * This function is used to inform userspace about a received control port @@ -5851,8 +5852,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, * Return: %true if the frame was passed to userspace */ bool cfg80211_rx_control_port(struct net_device *dev, - const u8 *buf, size_t len, - const u8 *addr, u16 proto, bool unencrypted); + struct sk_buff *skb, bool unencrypted); /** * cfg80211_cqm_rssi_notify - connection quality monitoring rssi event -- cgit v1.2.3 From b4e7a7a88b5d060650094b8d3454bc521d669f6a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 Jun 2018 11:17:54 -0400 Subject: drm_mode_create_lease_ioctl(): fix open-coded filp_clone_open() Failure of ->open() should *not* be followed by fput(). Fixed by using filp_clone_open(), which gets the cleanups right. 
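As a rough illustration of the calling pattern the fix switches to, here is a hedged sketch against the 4.18-era VFS API added in the diff below (not the actual drm_lease.c code; the helper name clone_for_lease is made up):

#include <linux/err.h>
#include <linux/fs.h>

/* Sketch: clone an already-open struct file instead of open-coding
 * alloc_file() + ->open().  filp_clone_open() runs ->open() itself and
 * unwinds its own allocations on failure, so the caller only checks
 * IS_ERR() -- no fput() on the error path.
 */
static struct file *clone_for_lease(struct file *lessor_file)
{
	struct file *lessee_file;

	lessee_file = filp_clone_open(lessor_file);
	if (IS_ERR(lessee_file))
		return lessee_file;	/* caller reports PTR_ERR() */

	/* ... per-lease setup on lessee_file goes here ... */
	return lessee_file;
}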
Cc: stable@vger.kernel.org Acked-by: Linus Torvalds Signed-off-by: Al Viro --- drivers/gpu/drm/drm_lease.c | 16 +--------------- fs/internal.h | 1 - include/linux/fs.h | 1 + 3 files changed, 2 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_lease.c b/drivers/gpu/drm/drm_lease.c index 50c73c0a20b9..d638c0fb3418 100644 --- a/drivers/gpu/drm/drm_lease.c +++ b/drivers/gpu/drm/drm_lease.c @@ -553,24 +553,13 @@ int drm_mode_create_lease_ioctl(struct drm_device *dev, /* Clone the lessor file to create a new file for us */ DRM_DEBUG_LEASE("Allocating lease file\n"); - path_get(&lessor_file->f_path); - lessee_file = alloc_file(&lessor_file->f_path, - lessor_file->f_mode, - fops_get(lessor_file->f_inode->i_fop)); - + lessee_file = filp_clone_open(lessor_file); if (IS_ERR(lessee_file)) { ret = PTR_ERR(lessee_file); goto out_lessee; } - /* Initialize the new file for DRM */ - DRM_DEBUG_LEASE("Initializing the file with %p\n", lessee_file->f_op->open); - ret = lessee_file->f_op->open(lessee_file->f_inode, lessee_file); - if (ret) - goto out_lessee_file; - lessee_priv = lessee_file->private_data; - /* Change the file to a master one */ drm_master_put(&lessee_priv->master); lessee_priv->master = lessee; @@ -588,9 +577,6 @@ int drm_mode_create_lease_ioctl(struct drm_device *dev, DRM_DEBUG_LEASE("drm_mode_create_lease_ioctl succeeded\n"); return 0; -out_lessee_file: - fput(lessee_file); - out_lessee: drm_master_put(&lessee); diff --git a/fs/internal.h b/fs/internal.h index 980d005b21b4..5645b4ebf494 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -127,7 +127,6 @@ int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); -extern struct file *filp_clone_open(struct file *); /* * inode.c diff --git a/include/linux/fs.h b/include/linux/fs.h index 5c91108846db..aa9b4c169ed2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2422,6 +2422,7 @@ extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int, umode_t); extern struct file * dentry_open(const struct path *, int, const struct cred *); +extern struct file *filp_clone_open(struct file *); extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int, int *); -- cgit v1.2.3 From 26b2f552525cf98fad08515bd6faa427f2f22038 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 13 Jul 2018 01:38:08 +0900 Subject: netfilter: nf_tables: fix jumpstack depth validation The level field of struct nft_ctx is updated by nf_tables_check_loops() and is used to validate the jumpstack depth. However, the jumpstack validation routine does not update or validate it recursively, so in some cases the chain depth can grow beyond NFT_JUMP_STACK_SIZE. After this patch, the jumpstack validation is done in nft_chain_validate(). When new rules or new set elements are added, nft_table_validate() is called from nf_tables_newrule and nf_tables_newsetelem; it calls nft_chain_validate(), which visits all child chains recursively, so the chain depth is always updated and checked correctly.
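To make the recursion concrete, here is a small user-space toy model of the depth check (illustrative only; the structure and function names are made up, and the real logic is in the nft_chain_validate()/nft_immediate_validate() hunks of the diff below):

#include <stdio.h>

#define JUMP_STACK_SIZE 16	/* stand-in for NFT_JUMP_STACK_SIZE */
#define EMLINK 31

struct chain {
	struct chain *jump_target;	/* NULL if the chain does not jump */
};

/* Validate one chain: refuse any jump path deeper than the jump stack. */
static int chain_validate(const struct chain *chain, unsigned int level)
{
	if (level == JUMP_STACK_SIZE)
		return -EMLINK;		/* would overflow the runtime jumpstack */
	if (!chain->jump_target)
		return 0;
	/* descend one level for the jump/goto, like ctx->level++ in the patch */
	return chain_validate(chain->jump_target, level + 1);
}

int main(void)
{
	struct chain c[20];
	int i;

	/* a0 -> a1 -> ... -> a19: nested jumps, like the reproducer below */
	for (i = 0; i < 20; i++)
		c[i].jump_target = (i < 19) ? &c[i + 1] : NULL;
	printf("%d\n", chain_validate(&c[0], 0));	/* prints -31 (-EMLINK) */
	return 0;
}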
Reproducer: %cat ./test.sh #!/bin/bash nft add table ip filter nft add chain ip filter input { type filter hook input priority 0\; } for ((i=0;i<20;i++)); do nft add chain ip filter a$i done nft add rule ip filter input jump a1 for ((i=0;i<10;i++)); do nft add rule ip filter a$i jump a$((i+1)) done for ((i=11;i<19;i++)); do nft add rule ip filter a$i jump a$((i+1)) done nft add rule ip filter a10 jump a11 Result: [ 253.931782] WARNING: CPU: 1 PID: 0 at net/netfilter/nf_tables_core.c:186 nft_do_chain+0xacc/0xdf0 [nf_tables] [ 253.931915] Modules linked in: nf_tables nfnetlink ip_tables x_tables [ 253.932153] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.18.0-rc3+ #48 [ 253.932153] RIP: 0010:nft_do_chain+0xacc/0xdf0 [nf_tables] [ 253.932153] Code: 83 f8 fb 0f 84 c7 00 00 00 e9 d0 00 00 00 83 f8 fd 74 0e 83 f8 ff 0f 84 b4 00 00 00 e9 bd 00 00 00 83 bd 64 fd ff ff 0f 76 09 <0f> 0b 31 c0 e9 bc 02 00 00 44 8b ad 64 fd [ 253.933807] RSP: 0018:ffff88011b807570 EFLAGS: 00010212 [ 253.933807] RAX: 00000000fffffffd RBX: ffff88011b807660 RCX: 0000000000000000 [ 253.933807] RDX: 0000000000000010 RSI: ffff880112b39d78 RDI: ffff88011b807670 [ 253.933807] RBP: ffff88011b807850 R08: ffffed0023700ece R09: ffffed0023700ecd [ 253.933807] R10: ffff88011b80766f R11: ffffed0023700ece R12: ffff88011b807898 [ 253.933807] R13: ffff880112b39d80 R14: ffff880112b39d60 R15: dffffc0000000000 [ 253.933807] FS: 0000000000000000(0000) GS:ffff88011b800000(0000) knlGS:0000000000000000 [ 253.933807] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 253.933807] CR2: 00000000014f1008 CR3: 000000006b216000 CR4: 00000000001006e0 [ 253.933807] Call Trace: [ 253.933807] [ 253.933807] ? sched_clock_cpu+0x132/0x170 [ 253.933807] ? __nft_trace_packet+0x180/0x180 [nf_tables] [ 253.933807] ? sched_clock_cpu+0x132/0x170 [ 253.933807] ? debug_show_all_locks+0x290/0x290 [ 253.933807] ? __lock_acquire+0x4835/0x4af0 [ 253.933807] ? inet_ehash_locks_alloc+0x1a0/0x1a0 [ 253.933807] ? unwind_next_frame+0x159e/0x1840 [ 253.933807] ? __read_once_size_nocheck.constprop.4+0x5/0x10 [ 253.933807] ? nft_do_chain_ipv4+0x197/0x1e0 [nf_tables] [ 253.933807] ? nft_do_chain+0x5/0xdf0 [nf_tables] [ 253.933807] nft_do_chain_ipv4+0x197/0x1e0 [nf_tables] [ 253.933807] ? nft_do_chain_arp+0xb0/0xb0 [nf_tables] [ 253.933807] ? __lock_is_held+0x9d/0x130 [ 253.933807] nf_hook_slow+0xc4/0x150 [ 253.933807] ip_local_deliver+0x28b/0x380 [ 253.933807] ? ip_call_ra_chain+0x3e0/0x3e0 [ 253.933807] ? ip_rcv_finish+0x1610/0x1610 [ 253.933807] ip_rcv+0xbcc/0xcc0 [ 253.933807] ? debug_show_all_locks+0x290/0x290 [ 253.933807] ? ip_local_deliver+0x380/0x380 [ 253.933807] ? __lock_is_held+0x9d/0x130 [ 253.933807] ? 
ip_local_deliver+0x380/0x380 [ 253.933807] __netif_receive_skb_core+0x1c9c/0x2240 Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- net/netfilter/nf_tables_api.c | 11 ++++------- net/netfilter/nft_immediate.c | 3 +++ net/netfilter/nft_lookup.c | 13 +++++++++++-- 4 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 08c005ce56e9..4e82a4c49912 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -150,6 +150,7 @@ static inline void nft_data_debug(const struct nft_data *data) * @portid: netlink portID of the original message * @seq: netlink sequence number * @family: protocol family + * @level: depth of the chains * @report: notify via unicast netlink message */ struct nft_ctx { @@ -160,6 +161,7 @@ struct nft_ctx { u32 portid; u32 seq; u8 family; + u8 level; bool report; }; @@ -865,7 +867,6 @@ enum nft_chain_flags { * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain - * @level: length of longest path to this chain * @flags: bitmask of enum nft_chain_flags * @name: name of the chain */ @@ -878,7 +879,6 @@ struct nft_chain { struct nft_table *table; u64 handle; u32 use; - u16 level; u8 flags:6, genmask:2; char *name; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 896d4a36081d..d41fa2c82f14 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -75,6 +75,7 @@ static void nft_ctx_init(struct nft_ctx *ctx, { ctx->net = net; ctx->family = family; + ctx->level = 0; ctx->table = table; ctx->chain = chain; ctx->nla = nla; @@ -2384,6 +2385,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) struct nft_rule *rule; int err; + if (ctx->level == NFT_JUMP_STACK_SIZE) + return -EMLINK; + list_for_each_entry(rule, &chain->rules, list) { if (!nft_is_active_next(ctx->net, rule)) continue; @@ -6837,13 +6841,6 @@ int nft_validate_register_store(const struct nft_ctx *ctx, err = nf_tables_check_loops(ctx, data->verdict.chain); if (err < 0) return err; - - if (ctx->chain->level + 1 > - data->verdict.chain->level) { - if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE) - return -EMLINK; - data->verdict.chain->level = ctx->chain->level + 1; - } } return 0; diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 15adf8ca82c3..0777a93211e2 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -98,6 +98,7 @@ static int nft_immediate_validate(const struct nft_ctx *ctx, const struct nft_data **d) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + struct nft_ctx *pctx = (struct nft_ctx *)ctx; const struct nft_data *data; int err; @@ -109,9 +110,11 @@ static int nft_immediate_validate(const struct nft_ctx *ctx, switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: + pctx->level++; err = nft_chain_validate(ctx, data->verdict.chain); if (err < 0) return err; + pctx->level--; break; default: break; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 42e6fadf1417..c2a1d84cdfc4 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -155,7 +155,9 @@ static int nft_lookup_validate_setelem(const struct nft_ctx *ctx, struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + struct nft_ctx *pctx = (struct nft_ctx *)ctx; const 
struct nft_data *data; + int err; if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) @@ -165,10 +167,17 @@ static int nft_lookup_validate_setelem(const struct nft_ctx *ctx, switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - return nft_chain_validate(ctx, data->verdict.chain); + pctx->level++; + err = nft_chain_validate(ctx, data->verdict.chain); + if (err < 0) + return err; + pctx->level--; + break; default: - return 0; + break; } + + return 0; } static int nft_lookup_validate(const struct nft_ctx *ctx, -- cgit v1.2.3 From 9ba546c01976a426292af99e682a557075d6c010 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jul 2018 15:48:46 +0200 Subject: aio: don't expose __aio_sigset in uapi glibc uses a different definition of sigset_t than the kernel does, and the current version would pull in both. To fix this just do not expose the type at all - this somewhat mirrors pselect() where we do not even have a type for the magic sigmask argument, but just use pointer arithmetic. Fixes: 7a074e96 ("aio: implement io_pgetevents") Signed-off-by: Christoph Hellwig Reported-by: Adrian Reber Signed-off-by: Al Viro --- fs/aio.c | 5 +++++ include/linux/syscalls.h | 1 + include/uapi/linux/aio_abi.h | 6 ------ 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/aio.c b/fs/aio.c index e1d20124ec0e..b1a42e45698b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2042,6 +2042,11 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, return ret; } +struct __aio_sigset { + const sigset_t __user *sigmask; + size_t sigsetsize; +}; + SYSCALL_DEFINE6(io_pgetevents, aio_context_t, ctx_id, long, min_nr, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 73810808cdf2..b06b5eeda8e8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -11,6 +11,7 @@ #ifndef _LINUX_SYSCALLS_H #define _LINUX_SYSCALLS_H +struct __aio_sigset; struct epoll_event; struct iattr; struct inode; diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index d00221345c19..ce43d340f010 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -29,7 +29,6 @@ #include #include -#include #include typedef __kernel_ulong_t aio_context_t; @@ -108,10 +107,5 @@ struct iocb { #undef IFBIG #undef IFLITTLE -struct __aio_sigset { - const sigset_t __user *sigmask; - size_t sigsetsize; -}; - #endif /* __LINUX__AIO_ABI_H */ -- cgit v1.2.3 From d7037ad73daa9598b8caa7d5fdf41e8ceee6ef73 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 8 Jul 2018 12:14:59 +0300 Subject: net/mlx5: Fix QP fragmented buffer allocation Fix bad alignment of SQ buffer in fragmented QP allocation. It should start directly after the RQ buffer ends. Take special care of the end case where the RQ buffer does not occupy a whole page. RQ size is a power of two, so this would be the case only for small RQ sizes (RQ size < PAGE_SIZE). Fix wrong assignments for sqb->size (mistakenly assigned the RQ size), and for the npages value of RQ and SQ.
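For a feel of the two fixes, here is a small stand-alone illustration of the arithmetic (user-space C with made-up sizes; the real code works on the frag-buf control fields shown in the diff below, and MLX5_SEND_WQE_BB is taken as 64 bytes purely for the example):

#include <stdio.h>

#define PAGE_SIZE	 4096u
#define MLX5_SEND_WQE_BB 64u	/* SQ stride ("basic block") size, assumed */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int rq_size = 1024;		/* small RQ: RQ size < PAGE_SIZE */
	unsigned int sq_size = 3 * PAGE_SIZE;	/* 12288 bytes */

	/* npages: the old 1 << get_order(size) rounds 12288 bytes up to 4
	 * pages; DIV_ROUND_UP(size, PAGE_SIZE) allocates exactly 3.
	 */
	printf("sq npages = %u\n", DIV_ROUND_UP(sq_size, PAGE_SIZE));

	/* strides_offset: the SQ starts right where the RQ ends, i.e.
	 * 1024 / 64 = 16 strides into the shared page rather than at
	 * stride 0 of a fresh page.
	 */
	printf("sq strides_offset = %u\n",
	       (rq_size % PAGE_SIZE) / MLX5_SEND_WQE_BB);
	return 0;
}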
Fixes: 3a2f70331226 ("net/mlx5: Use order-0 allocations for all WQ types") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/alloc.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/wq.c | 34 ++++++++++++++++--------- include/linux/mlx5/driver.h | 18 ++++++++++--- 3 files changed, 38 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c index 323ffe8bf7e4..456f30007ad6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c @@ -123,7 +123,7 @@ int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size, int i; buf->size = size; - buf->npages = 1 << get_order(size); + buf->npages = DIV_ROUND_UP(size, PAGE_SIZE); buf->page_shift = PAGE_SHIFT; buf->frags = kcalloc(buf->npages, sizeof(struct mlx5_buf_list), GFP_KERNEL); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c index b97bb72b4db4..86478a6b99c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c @@ -113,35 +113,45 @@ err_db_free: return err; } -static void mlx5e_qp_set_frag_buf(struct mlx5_frag_buf *buf, - struct mlx5_wq_qp *qp) +static void mlx5_qp_set_frag_buf(struct mlx5_frag_buf *buf, + struct mlx5_wq_qp *qp) { + struct mlx5_frag_buf_ctrl *sq_fbc; struct mlx5_frag_buf *rqb, *sqb; - rqb = &qp->rq.fbc.frag_buf; + rqb = &qp->rq.fbc.frag_buf; *rqb = *buf; rqb->size = mlx5_wq_cyc_get_byte_size(&qp->rq); - rqb->npages = 1 << get_order(rqb->size); + rqb->npages = DIV_ROUND_UP(rqb->size, PAGE_SIZE); - sqb = &qp->sq.fbc.frag_buf; - *sqb = *buf; - sqb->size = mlx5_wq_cyc_get_byte_size(&qp->rq); - sqb->npages = 1 << get_order(sqb->size); + sq_fbc = &qp->sq.fbc; + sqb = &sq_fbc->frag_buf; + *sqb = *buf; + sqb->size = mlx5_wq_cyc_get_byte_size(&qp->sq); + sqb->npages = DIV_ROUND_UP(sqb->size, PAGE_SIZE); sqb->frags += rqb->npages; /* first part is for the rq */ + if (sq_fbc->strides_offset) + sqb->frags--; } int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *qpc, struct mlx5_wq_qp *wq, struct mlx5_wq_ctrl *wq_ctrl) { + u32 sq_strides_offset; int err; mlx5_fill_fbc(MLX5_GET(qpc, qpc, log_rq_stride) + 4, MLX5_GET(qpc, qpc, log_rq_size), &wq->rq.fbc); - mlx5_fill_fbc(ilog2(MLX5_SEND_WQE_BB), - MLX5_GET(qpc, qpc, log_sq_size), - &wq->sq.fbc); + + sq_strides_offset = + ((wq->rq.fbc.frag_sz_m1 + 1) % PAGE_SIZE) / MLX5_SEND_WQE_BB; + + mlx5_fill_fbc_offset(ilog2(MLX5_SEND_WQE_BB), + MLX5_GET(qpc, qpc, log_sq_size), + sq_strides_offset, + &wq->sq.fbc); err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); if (err) { @@ -156,7 +166,7 @@ int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, goto err_db_free; } - mlx5e_qp_set_frag_buf(&wq_ctrl->buf, wq); + mlx5_qp_set_frag_buf(&wq_ctrl->buf, wq); wq->rq.db = &wq_ctrl->db.db[MLX5_RCV_DBR]; wq->sq.db = &wq_ctrl->db.db[MLX5_SND_DBR]; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 80cbb7fdce4a..83957920653a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -358,6 +358,7 @@ struct mlx5_frag_buf_ctrl { struct mlx5_frag_buf frag_buf; u32 sz_m1; u32 frag_sz_m1; + u32 strides_offset; u8 log_sz; u8 log_stride; u8 log_frag_strides; @@ -983,14 +984,22 @@ static inline u32 mlx5_base_mkey(const u32 key) return key & 0xffffff00u; } -static inline void 
mlx5_fill_fbc(u8 log_stride, u8 log_sz, - struct mlx5_frag_buf_ctrl *fbc) +static inline void mlx5_fill_fbc_offset(u8 log_stride, u8 log_sz, + u32 strides_offset, + struct mlx5_frag_buf_ctrl *fbc) { fbc->log_stride = log_stride; fbc->log_sz = log_sz; fbc->sz_m1 = (1 << fbc->log_sz) - 1; fbc->log_frag_strides = PAGE_SHIFT - fbc->log_stride; fbc->frag_sz_m1 = (1 << fbc->log_frag_strides) - 1; + fbc->strides_offset = strides_offset; +} + +static inline void mlx5_fill_fbc(u8 log_stride, u8 log_sz, + struct mlx5_frag_buf_ctrl *fbc) +{ + mlx5_fill_fbc_offset(log_stride, log_sz, 0, fbc); } static inline void mlx5_core_init_cq_frag_buf(struct mlx5_frag_buf_ctrl *fbc, @@ -1004,7 +1013,10 @@ static inline void mlx5_core_init_cq_frag_buf(struct mlx5_frag_buf_ctrl *fbc, static inline void *mlx5_frag_buf_get_wqe(struct mlx5_frag_buf_ctrl *fbc, u32 ix) { - unsigned int frag = (ix >> fbc->log_frag_strides); + unsigned int frag; + + ix += fbc->strides_offset; + frag = ix >> fbc->log_frag_strides; return fbc->frag_buf.frags[frag].buf + ((fbc->frag_sz_m1 & ix) << fbc->log_stride); -- cgit v1.2.3 From 36fc3c8c282c01ad1570bd864de52f128d731b75 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 19 Jul 2018 22:14:31 -0700 Subject: bpf: btf: Clean up BTF_INT_BITS() in uapi btf.h This patch shrinks the BTF_INT_BITS() mask. The current btf_int_check_meta() ensures the nr_bits of an integer cannot exceed 64. Hence, it is mostly an uapi cleanup. The actual btf usage (i.e. seq_show()) is also modified to use u8 instead of u16. The verification (e.g. btf_int_check_meta()) path stays as is to deal with invalid BTF situation. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 2 +- kernel/bpf/btf.c | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 0b5ddbe135a4..972265f32871 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -76,7 +76,7 @@ struct btf_type { */ #define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) -#define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff) +#define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) /* Attributes stored in the BTF_INT_ENCODING */ #define BTF_INT_SIGNED (1 << 0) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e016ac3afa24..9704934252b3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -450,7 +450,7 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) */ static bool btf_type_int_is_regular(const struct btf_type *t) { - u16 nr_bits, nr_bytes; + u8 nr_bits, nr_bytes; u32 int_data; int_data = btf_type_int(t); @@ -993,12 +993,16 @@ static void btf_int_bits_seq_show(const struct btf *btf, { u16 left_shift_bits, right_shift_bits; u32 int_data = btf_type_int(t); - u16 nr_bits = BTF_INT_BITS(int_data); - u16 total_bits_offset; - u16 nr_copy_bytes; - u16 nr_copy_bits; + u8 nr_bits = BTF_INT_BITS(int_data); + u8 total_bits_offset; + u8 nr_copy_bytes; + u8 nr_copy_bits; u64 print_num; + /* + * bits_offset is at most 7. + * BTF_INT_OFFSET() cannot exceed 64 bits. 
+ */ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); @@ -1028,7 +1032,7 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, u32 int_data = btf_type_int(t); u8 encoding = BTF_INT_ENCODING(int_data); bool sign = encoding & BTF_INT_SIGNED; - u32 nr_bits = BTF_INT_BITS(int_data); + u8 nr_bits = BTF_INT_BITS(int_data); if (bits_offset || BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(nr_bits)) { -- cgit v1.2.3 From b8088dda98b9064a2b3007fe54b03ede70a15602 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 17 Jul 2018 07:17:53 +0200 Subject: netfilter: nf_tables: use dev->name directly No need to store the name in a separate area. Furthermore, it uses kmalloc but not kfree, and most accesses seem to treat it as char[IFNAMSIZ], not char *. Remove this and use dev->name instead. In case the event zeroed the dev, just omit the name in the dump. Fixes: d92191aa84e5f1 ("netfilter: nf_tables: cache device name in flowtable object") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 - net/netfilter/nf_tables_api.c | 14 +++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4e82a4c49912..dc417ef0a0c5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1124,7 +1124,6 @@ struct nft_flowtable { u32 genmask:2, use:30; u64 handle; - char *dev_name[NFT_FLOWTABLE_DEVICE_MAX]; /* runtime data below here */ struct nf_hook_ops *ops ____cacheline_aligned; struct nf_flowtable data; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d41fa2c82f14..54a4f75ff9da 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5324,8 +5324,6 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, flowtable->ops[i].priv = &flowtable->data; flowtable->ops[i].hook = flowtable->data.type->hook; flowtable->ops[i].dev = dev_array[i]; - flowtable->dev_name[i] = kstrdup(dev_array[i]->name, - GFP_KERNEL); } return err; @@ -5483,10 +5481,8 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, err6: i = flowtable->ops_len; err5: - for (k = i - 1; k >= 0; k--) { - kfree(flowtable->dev_name[k]); + for (k = i - 1; k >= 0; k--) nf_unregister_net_hook(net, &flowtable->ops[k]); - } kfree(flowtable->ops); err4: @@ -5585,9 +5581,10 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; for (i = 0; i < flowtable->ops_len; i++) { - if (flowtable->dev_name[i][0] && - nla_put_string(skb, NFTA_DEVICE_NAME, - flowtable->dev_name[i])) + const struct net_device *dev = READ_ONCE(flowtable->ops[i].dev); + + if (dev && + nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); @@ -5829,7 +5826,6 @@ static void nft_flowtable_event(unsigned long event, struct net_device *dev, continue; nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]); - flowtable->dev_name[i][0] = '\0'; flowtable->ops[i].dev = NULL; break; } -- cgit v1.2.3 From 27cde44a259c380a3c09066fc4b42de7dde9b1ad Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 18 Jul 2018 13:56:35 -0700 Subject: tcp: do not cancel delay-AcK on DCTCP special ACK Currently when a DCTCP receiver delays an ACK and receives a data packet with a different CE mark from the
previous one, it sends two immediate ACKs acking the previous and latest sequences respectively (for ECN accounting). Previously, sending the first ACK could clear the delayed ACK timer (tcp_event_ack_sent). This may subsequently prevent sending the second ACK to acknowledge the latest sequence (tcp_ack_snd_check). The culprit is that tcp_send_ack() assumes it always acknowledges the latest sequence, which is not true for the first special ACK. The fix is to not make that assumption in tcp_send_ack() and to check the actual ack sequence before cancelling the delayed ACK. Further, it is safer to pass the ack sequence number into the tcp_send_ack routine as an argument, instead of temporarily rewriting tp->rcv_nxt, to avoid future bugs like this. Reported-by: Neal Cardwell Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_dctcp.c | 34 ++++------------------------------ net/ipv4/tcp_output.c | 10 +++++++--- 3 files changed, 12 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 3482d13d655b..a08de496d1b2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -539,6 +539,7 @@ void tcp_send_fin(struct sock *sk); void tcp_send_active_reset(struct sock *sk, gfp_t priority); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 5869f89ca656..078328afbfe3 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -133,21 +133,8 @@ static void dctcp_ce_state_0_to_1(struct sock *sk) * ACK has not sent yet. */ if (!ca->ce_state && - inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { - u32 tmp_rcv_nxt; - - /* Save current rcv_nxt. */ - tmp_rcv_nxt = tp->rcv_nxt; - - /* Generate previous ack with CE=0. */ - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; - tp->rcv_nxt = ca->prior_rcv_nxt; - - tcp_send_ack(sk); - - /* Recover current rcv_nxt. */ - tp->rcv_nxt = tmp_rcv_nxt; - } + inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); ca->prior_rcv_nxt = tp->rcv_nxt; ca->ce_state = 1; @@ -164,21 +151,8 @@ static void dctcp_ce_state_1_to_0(struct sock *sk) * ACK has not sent yet. */ if (ca->ce_state && - inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { - u32 tmp_rcv_nxt; - - /* Save current rcv_nxt. */ - tmp_rcv_nxt = tp->rcv_nxt; - - /* Generate previous ack with CE=1. */ - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; - tp->rcv_nxt = ca->prior_rcv_nxt; - - tcp_send_ack(sk); - - /* Recover current rcv_nxt. */ - tp->rcv_nxt = tmp_rcv_nxt; - } + inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); ca->prior_rcv_nxt = tp->rcv_nxt; ca->ce_state = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ee1b0705321d..c4172c1fb198 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -160,7 +160,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp, } /* Account for an ACK we sent.
*/ -static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) +static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, + u32 rcv_nxt) { struct tcp_sock *tp = tcp_sk(sk); @@ -171,6 +172,9 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } + + if (unlikely(rcv_nxt != tp->rcv_nxt)) + return; /* Special ACK sent by DCTCP to reflect ECN */ tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } @@ -1141,7 +1145,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, icsk->icsk_af_ops->send_check(sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); + tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); @@ -3613,12 +3617,12 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) /* Send it off, this clears delayed acks for us. */ __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt); } +EXPORT_SYMBOL_GPL(__tcp_send_ack); void tcp_send_ack(struct sock *sk) { __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); } -EXPORT_SYMBOL_GPL(tcp_send_ack); /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. -- cgit v1.2.3 From a0496ef2c23b3b180902dd185d0d63ccbc624cf8 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 18 Jul 2018 13:56:36 -0700 Subject: tcp: do not delay ACK in DCTCP upon CE status change Per DCTCP RFC8257 (Section 3.2) the ACK reflecting the CE status change has to be sent immediately so the sender can respond quickly: """ When receiving packets, the CE codepoint MUST be processed as follows: 1. If the CE codepoint is set and DCTCP.CE is false, set DCTCP.CE to true and send an immediate ACK. 2. If the CE codepoint is not set and DCTCP.CE is true, set DCTCP.CE to false and send an immediate ACK. """ Previously DCTCP implementation may continue to delay the ACK. This patch fixes that to implement the RFC by forcing an immediate ACK. Tested with this packetdrill script provided by Larry Brakmo 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0 0.000 bind(3, ..., ...) = 0 0.000 listen(3, 1) = 0 0.100 < [ect0] SEW 0:0(0) win 32792 0.100 > SE. 0:0(0) ack 1 0.110 < [ect0] . 1:1(0) ack 1 win 257 0.200 accept(3, ..., ...) = 4 +0 setsockopt(4, SOL_SOCKET, SO_DEBUG, [1], 4) = 0 0.200 < [ect0] . 1:1001(1000) ack 1 win 257 0.200 > [ect01] . 1:1(0) ack 1001 0.200 write(4, ..., 1) = 1 0.200 > [ect01] P. 1:2(1) ack 1001 0.200 < [ect0] . 1001:2001(1000) ack 2 win 257 +0.005 < [ce] . 2001:3001(1000) ack 2 win 257 +0.000 > [ect01] . 2:2(0) ack 2001 // Previously the ACK below would be delayed by 40ms +0.000 > [ect01] E. 2:2(0) ack 3001 +0.500 < F. 9501:9501(0) ack 4 win 257 Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_dctcp.c | 30 ++++++++++++++++++------------ net/ipv4/tcp_input.c | 3 ++- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index a08de496d1b2..25116ec02087 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -342,6 +342,7 @@ ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); +void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks); static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 078328afbfe3..8b637f9f23a2 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -129,12 +129,15 @@ static void dctcp_ce_state_0_to_1(struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - /* State has changed from CE=0 to CE=1 and delayed - * ACK has not sent yet. - */ - if (!ca->ce_state && - inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) - __tcp_send_ack(sk, ca->prior_rcv_nxt); + if (!ca->ce_state) { + /* State has changed from CE=0 to CE=1, force an immediate + * ACK to reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); + tcp_enter_quickack_mode(sk, 1); + } ca->prior_rcv_nxt = tp->rcv_nxt; ca->ce_state = 1; @@ -147,12 +150,15 @@ static void dctcp_ce_state_1_to_0(struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - /* State has changed from CE=1 to CE=0 and delayed - * ACK has not sent yet. - */ - if (ca->ce_state && - inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) - __tcp_send_ack(sk, ca->prior_rcv_nxt); + if (ca->ce_state) { + /* State has changed from CE=1 to CE=0, force an immediate + * ACK to reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); + tcp_enter_quickack_mode(sk, 1); + } ca->prior_rcv_nxt = tp->rcv_nxt; ca->ce_state = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8e5522c6833a..6bade06aaf72 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -215,7 +215,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.quick = quickacks; } -static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) +void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -223,6 +223,7 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.pingpong = 0; icsk->icsk_ack.ato = TCP_ATO_MIN; } +EXPORT_SYMBOL(tcp_enter_quickack_mode); /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. -- cgit v1.2.3 From 3928d4f5ee37cdc523894f6e549e6aae521d8980 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 21 Jul 2018 13:48:51 -0700 Subject: mm: use helper functions for allocating and freeing vm_area structs The vm_area_struct is one of the most fundamental memory management objects, but the management of it is entirely open-coded evertwhere, ranging from allocation and freeing (using kmem_cache_[z]alloc and kmem_cache_free) to initializing all the fields. 
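For illustration, the open-coded pattern in question looks roughly like this at a typical call site (a sketch, not any single file -- compare the "before" side of the fs/exec.c and mm/mmap.c hunks below; the helper name is made up):

/* needs <linux/mm.h>, <linux/mm_types.h>, <linux/slab.h> */
static struct vm_area_struct *open_coded_vma(struct mm_struct *mm,
					     unsigned long addr,
					     unsigned long len)
{
	struct vm_area_struct *vma;

	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma)
		return NULL;

	INIT_LIST_HEAD(&vma->anon_vma_chain);
	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	/* ... whatever flags/prot/pgoff this particular call site needs ... */
	return vma;
}

Teardown is the matching kmem_cache_free(vm_area_cachep, vma), repeated just as widely; the patch replaces these with vm_area_alloc(), vm_area_dup() and vm_area_free().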
We want to unify this in order to end up having some unified initialization of the vmas, and the first step to this is to at least have basic allocation functions. Right now those functions are literally just wrappers around the kmem_cache_*() calls. This is a purely mechanical conversion: # new vma: kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL) -> vm_area_alloc() # copy old vma kmem_cache_alloc(vm_area_cachep, GFP_KERNEL) -> vm_area_dup(old) # free vma kmem_cache_free(vm_area_cachep, vma) -> vm_area_free(vma) to the point where the old vma passed in to the vm_area_dup() function isn't even used yet (because I've left all the old manual initialization alone). Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 4 ++-- arch/ia64/mm/init.c | 8 ++++---- fs/exec.c | 4 ++-- include/linux/mm.h | 4 +++- kernel/fork.c | 21 ++++++++++++++++++--- mm/mmap.c | 22 +++++++++++----------- mm/nommu.c | 8 ++++---- 7 files changed, 44 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 3b38c717008a..e859246badca 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2278,7 +2278,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t DPRINT(("smpl_buf @%p\n", smpl_buf)); /* allocate vma */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) { DPRINT(("Cannot allocate vma\n")); goto error_kmem; @@ -2346,7 +2346,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t return 0; error: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); error_kmem: pfm_rvfree(smpl_buf, size); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 18278b448530..3f2321bffb72 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -114,7 +114,7 @@ ia64_init_addr_space (void) * the problem. When the process attempts to write to the register backing store * for the first time, it will get a SEGFAULT in this case. 
*/ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (vma) { INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; @@ -125,7 +125,7 @@ ia64_init_addr_space (void) down_write(¤t->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(¤t->mm->mmap_sem); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return; } up_write(¤t->mm->mmap_sem); @@ -133,7 +133,7 @@ ia64_init_addr_space (void) /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ if (!(current->personality & MMAP_PAGE_ZERO)) { - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (vma) { INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; @@ -144,7 +144,7 @@ ia64_init_addr_space (void) down_write(¤t->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(¤t->mm->mmap_sem); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return; } up_write(¤t->mm->mmap_sem); diff --git a/fs/exec.c b/fs/exec.c index 2d4e0075bd24..9bd83989ea25 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -290,7 +290,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; - bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + bprm->vma = vma = vm_area_alloc(); if (!vma) return -ENOMEM; @@ -326,7 +326,7 @@ err: up_write(&mm->mmap_sem); err_free: bprm->vma = NULL; - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return err; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 3982c83fdcbf..de2fd86c6154 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -155,7 +155,9 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, * mmap() functions). */ -extern struct kmem_cache *vm_area_cachep; +struct vm_area_struct *vm_area_alloc(void); +struct vm_area_struct *vm_area_dup(struct vm_area_struct *); +void vm_area_free(struct vm_area_struct *); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925c..0e23deb5acfc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -303,11 +303,26 @@ struct kmem_cache *files_cachep; struct kmem_cache *fs_cachep; /* SLAB cache for vm_area_struct structures */ -struct kmem_cache *vm_area_cachep; +static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +struct vm_area_struct *vm_area_alloc(void) +{ + return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); +} + +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) +{ + return kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); +} + +void vm_area_free(struct vm_area_struct *vma) +{ + kmem_cache_free(vm_area_cachep, vma); +} + static void account_kernel_stack(struct task_struct *tsk, int account) { void *stack = task_stack_page(tsk); @@ -455,7 +470,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto fail_nomem; charge = len; } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; *tmp = *mpnt; @@ -539,7 +554,7 @@ fail_uprobe_end: fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); + vm_area_free(tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); diff --git a/mm/mmap.c b/mm/mmap.c index 5801b5f0a634..4286ad2dd1f5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -182,7 +182,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) 
if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return next; } @@ -911,7 +911,7 @@ again: anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); - kmem_cache_free(vm_area_cachep, next); + vm_area_free(next); /* * In mprotect's case 6 (see comments on vma_merge), * we must remove another next too. It would clutter @@ -1729,7 +1729,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) { error = -ENOMEM; goto unacct_error; @@ -1832,7 +1832,7 @@ allow_write_and_free_vma: if (vm_flags & VM_DENYWRITE) allow_write_access(file); free_vma: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); unacct_error: if (charged) vm_unacct_memory(charged); @@ -2620,7 +2620,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return err; } - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new = vm_area_dup(vma); if (!new) return -ENOMEM; @@ -2669,7 +2669,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, out_free_mpol: mpol_put(vma_policy(new)); out_free_vma: - kmem_cache_free(vm_area_cachep, new); + vm_area_free(new); return err; } @@ -2984,7 +2984,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* * create a vma struct for an anonymous mapping */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; @@ -3202,7 +3202,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { - new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new_vma = vm_area_dup(vma); if (!new_vma) goto out; *new_vma = *vma; @@ -3226,7 +3226,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: - kmem_cache_free(vm_area_cachep, new_vma); + vm_area_free(new_vma); out: return NULL; } @@ -3350,7 +3350,7 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3376,7 +3376,7 @@ static struct vm_area_struct *__install_special_mapping( return vma; out: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return ERR_PTR(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index 4452d8bd9ae4..006e3fe65017 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -769,7 +769,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); put_nommu_region(vma->vm_region); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); } /* @@ -1204,7 +1204,7 @@ unsigned long do_mmap(struct file *file, if (!region) goto error_getting_region; - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) goto error_getting_vma; @@ -1368,7 +1368,7 @@ error: kmem_cache_free(vm_region_jar, region); if (vma->vm_file) fput(vma->vm_file); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return ret; sharing_violation: @@ -1469,7 +1469,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!region) return -ENOMEM; - new = 
kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new = vm_area_dup(vma); if (!new) { kmem_cache_free(vm_region_jar, region); return -ENOMEM; -- cgit v1.2.3 From 490fc053865c9cc40f1085ef8a5504f5341f79d2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 21 Jul 2018 15:24:03 -0700 Subject: mm: make vm_area_alloc() initialize core fields Like vm_area_dup(), it initializes the anon_vma_chain head, and the basic mm pointer. The rest of the fields end up being different for different users, although the plan is to also initialize the 'vm_ops' field to a dummy entry. Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 4 +--- arch/ia64/mm/init.c | 8 ++------ fs/exec.c | 4 +--- include/linux/mm.h | 2 +- kernel/fork.c | 10 ++++++++-- mm/mmap.c | 12 +++--------- mm/nommu.c | 3 +-- 7 files changed, 17 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index e859246badca..46bff1661836 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2278,17 +2278,15 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t DPRINT(("smpl_buf @%p\n", smpl_buf)); /* allocate vma */ - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (!vma) { DPRINT(("Cannot allocate vma\n")); goto error_kmem; } - INIT_LIST_HEAD(&vma->anon_vma_chain); /* * partially initialize the vma for the sampling buffer */ - vma->vm_mm = mm; vma->vm_file = get_file(filp); vma->vm_flags = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP; vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 3f2321bffb72..bdb14a369137 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -114,10 +114,8 @@ ia64_init_addr_space (void) * the problem. When the process attempts to write to the register backing store * for the first time, it will get a SEGFAULT in this case. 
*/ - vma = vm_area_alloc(); + vma = vm_area_alloc(current->mm); if (vma) { - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = current->mm; vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; @@ -133,10 +131,8 @@ ia64_init_addr_space (void) /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ if (!(current->personality & MMAP_PAGE_ZERO)) { - vma = vm_area_alloc(); + vma = vm_area_alloc(current->mm); if (vma) { - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = current->mm; vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | diff --git a/fs/exec.c b/fs/exec.c index 9bd83989ea25..72e961a62adb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -290,7 +290,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; - bprm->vma = vma = vm_area_alloc(); + bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; @@ -298,7 +298,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) err = -EINTR; goto err_free; } - vma->vm_mm = mm; /* * Place the stack at the largest stack address the architecture @@ -311,7 +310,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); if (err) diff --git a/include/linux/mm.h b/include/linux/mm.h index de2fd86c6154..d3a3842316b8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -155,7 +155,7 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, * mmap() functions). */ -struct vm_area_struct *vm_area_alloc(void); +struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); diff --git a/kernel/fork.c b/kernel/fork.c index 67253e41bfb0..a191c05e757d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -308,9 +308,15 @@ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -struct vm_area_struct *vm_area_alloc(void) +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { - return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + + if (vma) { + vma->vm_mm = mm; + INIT_LIST_HEAD(&vma->anon_vma_chain); + } + return vma; } struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) diff --git a/mm/mmap.c b/mm/mmap.c index b0ed8ce1b67e..ff1944d8d458 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1729,19 +1729,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. 
*/ - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (!vma) { error = -ENOMEM; goto unacct_error; } - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); if (file) { if (vm_flags & VM_DENYWRITE) { @@ -2979,14 +2977,12 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* * create a vma struct for an anonymous mapping */ - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_pgoff = pgoff; @@ -3343,12 +3339,10 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; diff --git a/mm/nommu.c b/mm/nommu.c index c2560e9cc803..1d22fdbf7d7c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1204,7 +1204,7 @@ unsigned long do_mmap(struct file *file, if (!region) goto error_getting_region; - vma = vm_area_alloc(); + vma = vm_area_alloc(current->mm); if (!vma) goto error_getting_vma; @@ -1212,7 +1212,6 @@ unsigned long do_mmap(struct file *file, region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_flags = vm_flags; vma->vm_pgoff = pgoff; -- cgit v1.2.3 From f95de8aa9f824d96421cb7ca81552b4ad8768a31 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 19 Jul 2018 15:56:59 +0800 Subject: bpfilter: Fix mismatch in function argument types Fix following warning: net/ipv4/bpfilter/sockopt.c:28:5: error: symbol 'bpfilter_ip_set_sockopt' redeclared with different type net/ipv4/bpfilter/sockopt.c:34:5: error: symbol 'bpfilter_ip_get_sockopt' redeclared with different type Signed-off-by: YueHaibing Acked-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpfilter.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index 687b1760bb9f..f02cee0225d4 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -5,10 +5,10 @@ #include struct sock; -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval, +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen); -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval, - int *optlen); +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, + int __user *optlen); extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set); -- cgit v1.2.3 From 24b711edfc34bc45777a3f068812b7d1ed004a5d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 19 Jul 2018 12:41:18 -0700 Subject: net/ipv6: Fix linklocal to global address with VRF Example setup: host: ip -6 addr add dev eth1 2001:db8:104::4 where eth1 is enslaved to a VRF switch: ip -6 ro add 2001:db8:104::4/128 dev br1 where br1 only has an LLA ping6 2001:db8:104::4 ssh 2001:db8:104::4 (NOTE: UDP works fine if the PKTINFO has the address set to the global address and ifindex is set to the index of eth1 with a destination an LLA). 
For ICMP, icmp6_iif needs to be updated to check if skb->dev is an L3 master. If it is then return the ifindex from rt6i_idev similar to what is done for loopback. For TCP, restore the original tcp_v6_iif definition which is needed in most places and add a new tcp_v6_iif_l3_slave that considers the l3_slave variability. This latter check is only needed for socket lookups. Fixes: 9ff74384600a ("net: vrf: Handle ipv6 multicast and link-local addresses") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp.h | 5 +++++ net/ipv6/icmp.c | 5 +++-- net/ipv6/tcp_ipv6.c | 6 ++++-- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 25116ec02087..cd3ecda9386a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -840,6 +840,11 @@ static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) * as TCP moves IP6CB into a different location in skb->cb[] */ static inline int tcp_v6_iif(const struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->header.h6.iif; +} + +static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index be491bf6ab6e..ef2505aefc15 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -402,9 +402,10 @@ static int icmp6_iif(const struct sk_buff *skb) /* for local traffic to local address, skb dev is the loopback * device. Check if there is a dst attached to the skb and if so - * get the real device index. + * get the real device index. Same is needed for replies to a link + * local address on a device enslaved to an L3 master device */ - if (unlikely(iif == LOOPBACK_IFINDEX)) { + if (unlikely(iif == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) { const struct rt6_info *rt6 = skb_rt6_info(skb); if (rt6) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7efa9fd7e109..03e6b7a2bc53 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -938,7 +938,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) &tcp_hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, - ntohs(th->source), tcp_v6_iif(skb), + ntohs(th->source), + tcp_v6_iif_l3_slave(skb), tcp_v6_sdif(skb)); if (!sk1) goto out; @@ -1609,7 +1610,8 @@ do_time_wait: skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, - ntohs(th->dest), tcp_v6_iif(skb), + ntohs(th->dest), + tcp_v6_iif_l3_slave(skb), sdif); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); -- cgit v1.2.3 From f88a333b44318643282b8acc92af90deda441f5e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 Jul 2018 15:07:11 +0100 Subject: alpha: fix osf_wait4() breakage kernel_wait4() expects a userland address for status - it's only rusage that goes as a kernel one (and needs a copyout afterwards) [ Also, fix the prototype of kernel_wait4() to have that __user annotation - Linus ] Fixes: 92ebce5ac55d ("osf_wait4: switch to kernel_wait4()") Cc: stable@kernel.org # v4.13+ Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/alpha/kernel/osf_sys.c | 5 +---- include/linux/sched/task.h | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 6e921754c8fc..c210a25dd6da 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1180,13 +1180,10 @@ SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) 
 SYSCALL_DEFINE4(osf_wait4, pid_t, pid, int __user *, ustatus, int, options,
 		struct rusage32 __user *, ur)
 {
-	unsigned int status = 0;
 	struct rusage r;
-	long err = kernel_wait4(pid, &status, options, &r);
+	long err = kernel_wait4(pid, ustatus, options, &r);
 	if (err <= 0)
 		return err;
-	if (put_user(status, ustatus))
-		return -EFAULT;
 	if (!ur)
 		return err;
 	if (put_tv_to_tv32(&ur->ru_utime, &r.ru_utime))
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 5be31eb7b266..108ede99e533 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -75,7 +75,7 @@ extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *,
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
-extern long kernel_wait4(pid_t, int *, int, struct rusage *);
+extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
 
 extern void free_task(struct task_struct *tsk);
--
cgit v1.2.3


From e873e4b9cc7e8ce79e5c5627b32b107035bb3f5d Mon Sep 17 00:00:00 2001
From: Wei Wang
Date: Sat, 21 Jul 2018 20:56:32 -0700
Subject: ipv6: use fib6_info_hold_safe() when necessary

In the code path where only the rcu read lock is held, e.g. in the route
lookup code path, it is not safe to directly call fib6_info_hold()
because the fib6_info may already have been deleted but still exists
in the rcu grace period. Holding a reference to it could cause a double
free and crash the kernel.

This patch adds a new function fib6_info_hold_safe() and replaces
fib6_info_hold() with it in all the necessary places.

Syzbot reported 3 crash traces because of this. One of them is:

8021q: adding VLAN 0 to HW filter on device team0
IPv6: ADDRCONF(NETDEV_CHANGE): team0: link becomes ready
dst_release: dst:(____ptrval____) refcnt:-1
dst_release: dst:(____ptrval____) refcnt:-2
WARNING: CPU: 1 PID: 4845 at include/net/dst.h:239 dst_hold include/net/dst.h:239 [inline]
WARNING: CPU: 1 PID: 4845 at include/net/dst.h:239 ip6_setup_cork+0xd66/0x1830 net/ipv6/ip6_output.c:1204
dst_release: dst:(____ptrval____) refcnt:-1
Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 4845 Comm: syz-executor493 Not tainted 4.18.0-rc3+ #10
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
 panic+0x238/0x4e7 kernel/panic.c:184
dst_release: dst:(____ptrval____) refcnt:-2
dst_release: dst:(____ptrval____) refcnt:-3
 __warn.cold.8+0x163/0x1ba kernel/panic.c:536
dst_release: dst:(____ptrval____) refcnt:-4
 report_bug+0x252/0x2d0 lib/bug.c:186
 fixup_bug arch/x86/kernel/traps.c:178 [inline]
 do_error_trap+0x1fc/0x4d0 arch/x86/kernel/traps.c:296
dst_release: dst:(____ptrval____) refcnt:-5
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:316
 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
RIP: 0010:dst_hold include/net/dst.h:239 [inline]
RIP: 0010:ip6_setup_cork+0xd66/0x1830 net/ipv6/ip6_output.c:1204
Code: c1 ed 03 89 9d 18 ff ff ff 48 b8 00 00 00 00 00 fc ff df 41 c6 44 05 00 f8 e9 2d 01 00 00 4c 8b a5 c8 fe ff ff e8 1a f6 e6 fa <0f> 0b e9 6a fc ff ff e8 0e f6 e6 fa 48 8b 85 d0 fe ff ff 48 8d 78
RSP: 0018:ffff8801a8fcf178 EFLAGS: 00010293
RAX: ffff8801a8eba5c0 RBX: 0000000000000000 RCX: ffffffff869511e6
RDX: 0000000000000000 RSI: ffffffff869515b6 RDI: 0000000000000005
RBP: ffff8801a8fcf2c8 R08: ffff8801a8eba5c0 R09: ffffed0035ac8338
R10: ffffed0035ac8338 R11: ffff8801ad6419c3 R12: ffff8801a8fcf720
R13: ffff8801a8fcf6a0 R14: ffff8801ad6419c0 R15: ffff8801ad641980
 ip6_make_skb+0x2c8/0x600 net/ipv6/ip6_output.c:1768
 udpv6_sendmsg+0x2c90/0x35f0 net/ipv6/udp.c:1376
 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798
 sock_sendmsg_nosec net/socket.c:641 [inline]
 sock_sendmsg+0xd5/0x120 net/socket.c:651
 ___sys_sendmsg+0x51d/0x930 net/socket.c:2125
 __sys_sendmmsg+0x240/0x6f0 net/socket.c:2220
 __do_sys_sendmmsg net/socket.c:2249 [inline]
 __se_sys_sendmmsg net/socket.c:2246 [inline]
 __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2246
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x446ba9
Code: e8 cc bb 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007fb39a469da8 EFLAGS: 00000246 ORIG_RAX: 0000000000000133
RAX: ffffffffffffffda RBX: 00000000006dcc54 RCX: 0000000000446ba9
RDX: 00000000000000b8 RSI: 0000000020001b00 RDI: 0000000000000003
RBP: 00000000006dcc50 R08: 00007fb39a46a700 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 45c828efc7a64843
R13: e6eeb815b9d8a477 R14: 5068caf6f713c6fc R15: 0000000000000001
Dumping ftrace buffer:
   (ftrace buffer empty)
Kernel Offset: disabled
Rebooting in 86400 seconds..

Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes")
Reported-by: syzbot+902e2a1bcd4f7808cef5@syzkaller.appspotmail.com
Reported-by: syzbot+8ae62d67f647abeeceb9@syzkaller.appspotmail.com
Reported-by: syzbot+3f08feb14086930677d0@syzkaller.appspotmail.com
Signed-off-by: Wei Wang
Acked-by: Eric Dumazet
Reviewed-by: David Ahern
Signed-off-by: David S. Miller
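The heart of the fix, visible in the include/net/ip6_fib.h hunk below, is
to take a reference only while the count is still non-zero
(atomic_inc_not_zero()). A rough user-space model of the hold vs.
hold-safe distinction, written with C11 atomics rather than the kernel's
atomic_t API and using illustrative names only:

/* User-space model of the pattern; illustrative, not the kernel code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int ref;		/* 0 means the object is already being torn down */
};

/* Plain hold: only valid when the caller already owns a reference. */
static void obj_hold(struct obj *o)
{
	atomic_fetch_add(&o->ref, 1);
}

/* Safe hold: a lookup that merely *found* the object (e.g. under RCU) must
 * use this, because the last reference may already have been dropped. */
static bool obj_hold_safe(struct obj *o)
{
	int old = atomic_load(&o->ref);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&o->ref, &old, old + 1))
			return true;	/* took a reference on a live object */
	}
	return false;			/* object is dying; do not resurrect it */
}

static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->ref, 1) == 1)
		free(o);		/* last reference gone */
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	if (!o)
		return 1;
	atomic_init(&o->ref, 1);	/* creator's reference */

	if (obj_hold_safe(o)) {		/* lookup path: object might be dying */
		obj_hold(o);		/* now we own a ref, plain hold is fine */
		obj_put(o);
		obj_put(o);
	}
	obj_put(o);			/* creator's put frees the object */
	printf("done\n");
	return 0;
}

A plain increment would happily take the count from 0 back to 1 and hand
out a pointer to an object whose free is already scheduled, which is the
double free the trace above reflects.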
---
 include/net/ip6_fib.h |  5 +++++
 net/ipv6/addrconf.c   |  3 ++-
 net/ipv6/route.c      | 41 +++++++++++++++++++++++++++++++----------
 3 files changed, 38 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 71b9043aa0e7..3d4930528db0 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -281,6 +281,11 @@ static inline void fib6_info_hold(struct fib6_info *f6i)
 	atomic_inc(&f6i->fib6_ref);
 }
 
+static inline bool fib6_info_hold_safe(struct fib6_info *f6i)
+{
+	return atomic_inc_not_zero(&f6i->fib6_ref);
+}
+
 static inline void fib6_info_release(struct fib6_info *f6i)
 {
 	if (f6i && atomic_dec_and_test(&f6i->fib6_ref))
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 91580c62bb86..f66a1cae3366 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2374,7 +2374,8 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 			continue;
 		if ((rt->fib6_flags & noflags) != 0)
 			continue;
-		fib6_info_hold(rt);
+		if (!fib6_info_hold_safe(rt))
+			continue;
 		break;
 	}
 out:
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2ce0bd17de4f..ec18b3ce8b6d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -972,10 +972,10 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
 	rt->dst.lastuse = jiffies;
 }
 
+/* Caller must already hold reference to @from */
 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
 {
 	rt->rt6i_flags &= ~RTF_EXPIRES;
-	fib6_info_hold(from);
 	rcu_assign_pointer(rt->from, from);
 	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
 	if (from->fib6_metrics != &dst_default_metrics) {
@@ -984,6 +984,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
 	}
 }
 
+/* Caller must already hold reference to @ort */
 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 {
 	struct net_device *dev = fib6_info_nh_dev(ort);
@@ -1044,9 +1045,14 @@ static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
 	struct net_device *dev = rt->fib6_nh.nh_dev;
 	struct rt6_info *nrt;
 
+	if (!fib6_info_hold_safe(rt))
+		return NULL;
+
 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
 	if (nrt)
 		ip6_rt_copy_init(nrt, rt);
+	else
+		fib6_info_release(rt);
 
 	return nrt;
 }
@@ -1178,10 +1184,15 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
 	 *	Clone the route.
 	 */
 
+	if (!fib6_info_hold_safe(ort))
+		return NULL;
+
 	dev = ip6_rt_get_dev_rcu(ort);
 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
-	if (!rt)
+	if (!rt) {
+		fib6_info_release(ort);
 		return NULL;
+	}
 
 	ip6_rt_copy_init(rt, ort);
 	rt->rt6i_flags |= RTF_CACHE;
@@ -1210,12 +1221,17 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
 	struct net_device *dev;
 	struct rt6_info *pcpu_rt;
 
+	if (!fib6_info_hold_safe(rt))
+		return NULL;
+
 	rcu_read_lock();
 	dev = ip6_rt_get_dev_rcu(rt);
 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
 	rcu_read_unlock();
-	if (!pcpu_rt)
+	if (!pcpu_rt) {
+		fib6_info_release(rt);
 		return NULL;
+	}
 	ip6_rt_copy_init(pcpu_rt, rt);
 	pcpu_rt->rt6i_flags |= RTF_PCPU;
 	return pcpu_rt;
@@ -2486,7 +2502,7 @@ restart:
 
 out:
 	if (ret)
-		dst_hold(&ret->dst);
+		ip6_hold_safe(net, &ret, true);
 	else
 		ret = ip6_create_rt_rcu(rt);
 
@@ -3303,7 +3319,8 @@ static int ip6_route_del(struct fib6_config *cfg,
 				continue;
 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
 				continue;
-			fib6_info_hold(rt);
+			if (!fib6_info_hold_safe(rt))
+				continue;
 			rcu_read_unlock();
 
 			/* if gateway was specified only delete the one hop */
@@ -3409,6 +3426,9 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 
 	rcu_read_lock();
 	from = rcu_dereference(rt->from);
+	/* This fib6_info_hold() is safe here because we hold reference to rt
+	 * and rt already holds reference to fib6_info.
+	 */
 	fib6_info_hold(from);
 	rcu_read_unlock();
 
@@ -3470,7 +3490,8 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
 			continue;
 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
 			continue;
-		fib6_info_hold(rt);
+		if (!fib6_info_hold_safe(rt))
+			continue;
 		break;
 	}
 out:
@@ -3530,8 +3551,8 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
 		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
 			break;
 	}
-	if (rt)
-		fib6_info_hold(rt);
+	if (rt && !fib6_info_hold_safe(rt))
+		rt = NULL;
 	rcu_read_unlock();
 	return rt;
 }
@@ -3579,8 +3600,8 @@ restart:
 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
 
 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
-		    (!idev || idev->cnf.accept_ra != 2)) {
-			fib6_info_hold(rt);
+		    (!idev || idev->cnf.accept_ra != 2) &&
+		    fib6_info_hold_safe(rt)) {
 			rcu_read_unlock();
 			ip6_del_rt(net, rt);
 			goto restart;
--
cgit v1.2.3